structure saas with tools
This commit is contained in:
575
.venv/lib/python3.10/site-packages/vertex_ray/cluster_init.py
Normal file
575
.venv/lib/python3.10/site-packages/vertex_ray/cluster_init.py
Normal file
@@ -0,0 +1,575 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2023 Google LLC
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import copy
|
||||
import logging
|
||||
import time
|
||||
from typing import Dict, List, Optional
|
||||
import warnings
|
||||
|
||||
from google.cloud.aiplatform import initializer
|
||||
from google.cloud.aiplatform import utils
|
||||
from google.cloud.aiplatform.utils import resource_manager_utils
|
||||
from google.cloud.aiplatform_v1beta1.types import persistent_resource_service
|
||||
from google.cloud.aiplatform_v1beta1.types.machine_resources import NfsMount
|
||||
from google.cloud.aiplatform_v1beta1.types.persistent_resource import (
|
||||
PersistentResource,
|
||||
RayLogsSpec,
|
||||
RaySpec,
|
||||
RayMetricSpec,
|
||||
ResourcePool,
|
||||
ResourceRuntimeSpec,
|
||||
ServiceAccountSpec,
|
||||
)
|
||||
from google.cloud.aiplatform_v1beta1.types.service_networking import (
|
||||
PscInterfaceConfig,
|
||||
)
|
||||
from google.cloud.aiplatform.vertex_ray.util import (
|
||||
_gapic_utils,
|
||||
_validation_utils,
|
||||
resources,
|
||||
)
|
||||
|
||||
from google.protobuf import field_mask_pb2 # type: ignore
|
||||
from google.cloud.aiplatform.vertex_ray.util._validation_utils import (
|
||||
_V2_4_WARNING_MESSAGE,
|
||||
_V2_9_WARNING_MESSAGE,
|
||||
)
|
||||
|
||||
|
||||
def create_ray_cluster(
    # NOTE(review): mutable/shared defaults below (Resources() and
    # [Resources()]) are evaluated once at import time. They are never
    # mutated in this function, so this is safe today, but new code should
    # not start mutating them.
    head_node_type: Optional[resources.Resources] = resources.Resources(),
    python_version: Optional[str] = "3.10",
    ray_version: Optional[str] = "2.42",
    network: Optional[str] = None,
    service_account: Optional[str] = None,
    cluster_name: Optional[str] = None,
    worker_node_types: Optional[List[resources.Resources]] = [resources.Resources()],
    custom_images: Optional[resources.NodeImages] = None,
    enable_metrics_collection: Optional[bool] = True,
    enable_logging: Optional[bool] = True,
    psc_interface_config: Optional[resources.PscIConfig] = None,
    reserved_ip_ranges: Optional[List[str]] = None,
    nfs_mounts: Optional[List[resources.NfsMount]] = None,
    labels: Optional[Dict[str, str]] = None,
) -> str:
    """Create a ray cluster on the Vertex AI.

    Sample usage:

    from vertex_ray import Resources

    head_node_type = Resources(
        machine_type="n1-standard-8",
        node_count=1,
        accelerator_type="NVIDIA_TESLA_T4",
        accelerator_count=1,
        custom_image="us-docker.pkg.dev/my-project/ray-cpu-image.2.33:latest",  # Optional
    )

    worker_node_types = [Resources(
        machine_type="n1-standard-8",
        node_count=2,
        accelerator_type="NVIDIA_TESLA_T4",
        accelerator_count=1,
        custom_image="us-docker.pkg.dev/my-project/ray-gpu-image.2.33:latest",  # Optional
    )]

    cluster_resource_name = vertex_ray.create_ray_cluster(
        head_node_type=head_node_type,
        network="projects/my-project-number/global/networks/my-vpc-name",  # Optional
        service_account="my-service-account@my-project-number.iam.gserviceaccount.com",  # Optional
        cluster_name="my-cluster-name",  # Optional
        worker_node_types=worker_node_types,
        ray_version="2.33",
    )

    After a ray cluster is set up, you can call
    `ray.init(f"vertex_ray://{cluster_resource_name}", runtime_env=...)` without
    specifying ray cluster address to connect to the cluster. To shut down the
    cluster you can call `ray.delete_ray_cluster()`.
    Note: If the active ray cluster has not finished shutting down, you cannot
    create a new ray cluster with the same cluster_name.

    Args:
        head_node_type: The head node resource. Resources.node_count must be 1.
            If not set, default value of Resources() class will be used.
        python_version: Python version for the ray cluster.
        ray_version: Ray version for the ray cluster. Default is 2.42.0.
        network: Virtual private cloud (VPC) network. For Ray Client, VPC
            peering is required to connect to the Ray Cluster managed in the
            Vertex API service. For Ray Job API, VPC network is not required
            because Ray Cluster connection can be accessed through dashboard
            address.
        service_account: Service account to be used for running Ray programs on
            the cluster.
        cluster_name: This value may be up to 63 characters, and valid
            characters are `[a-z0-9_-]`. The first character cannot be a number
            or hyphen.
        worker_node_types: The list of Resources of the worker nodes. The same
            Resources object should not appear multiple times in the list.
        custom_images: The NodeImages which specifies head node and worker nodes
            images. All the workers will share the same image. If each Resource
            has a specific custom image, use `Resources.custom_image` for
            head/worker_node_type(s). Note that configuring `Resources.custom_image`
            will override `custom_images` here. Allowlist only.
        enable_metrics_collection: Enable Ray metrics collection for visualization.
        enable_logging: Enable exporting Ray logs to Cloud Logging.
        psc_interface_config: PSC-I config.
        reserved_ip_ranges: A list of names for the reserved IP ranges under
            the VPC network that can be used for this cluster. If set, we will
            deploy the cluster within the provided IP ranges. Otherwise, the
            cluster is deployed to any IP ranges under the provided VPC network.
            Example: ["vertex-ai-ip-range"].
        labels:
            The labels with user-defined metadata to organize Ray cluster.

            Label keys and values can be no longer than 64 characters (Unicode
            codepoints), can only contain lowercase letters, numeric characters,
            underscores and dashes. International characters are allowed.

            See https://goo.gl/xmQnxf for more information and examples of labels.

    Returns:
        The cluster_resource_name of the initiated Ray cluster on Vertex.
    Raise:
        ValueError: If the cluster is not created successfully.
        RuntimeError: If the ray_version is 2.4.
    """

    if network is None:
        logging.info(
            "[Ray on Vertex]: No VPC network configured. It is required for client connection."
        )
    # Hard-stop unsupported / deprecated Ray versions up front.
    if ray_version == "2.4":
        raise RuntimeError(_V2_4_WARNING_MESSAGE)
    if ray_version == "2.9.3":
        warnings.warn(_V2_9_WARNING_MESSAGE, DeprecationWarning, stacklevel=1)
    # Warn (but do not fail) on a client/cluster Ray version mismatch, since
    # Ray Client connectivity requires matching versions.
    local_ray_version = _validation_utils.get_local_ray_version()
    if ray_version != local_ray_version:
        if custom_images is None and head_node_type.custom_image is None:
            install_ray_version = "2.42.0"
            logging.info(
                "[Ray on Vertex]: Local runtime has Ray version %s"
                ", but the requested cluster runtime has %s. Please "
                "ensure that the Ray versions match for client connectivity. You may "
                '"pip install --user --force-reinstall ray[default]==%s"'
                " and restart runtime before cluster connection."
                % (local_ray_version, ray_version, install_ray_version)
            )
        else:
            logging.info(
                "[Ray on Vertex]: Local runtime has Ray version %s."
                "Please ensure that the Ray versions match for client connectivity."
                % local_ray_version
            )

    if cluster_name is None:
        cluster_name = "ray-cluster-" + utils.timestamped_unique_name()

    # Validate the head node: exactly one replica, no autoscaling, and an
    # accelerator type whenever an accelerator count is requested.
    if head_node_type:
        if head_node_type.node_count != 1:
            raise ValueError(
                "[Ray on Vertex AI]: For head_node_type, "
                + "Resources.node_count must be 1."
            )
        if head_node_type.autoscaling_spec is not None:
            raise ValueError(
                "[Ray on Vertex AI]: For head_node_type, "
                + "Resources.autoscaling_spec must be None."
            )
        if (
            head_node_type.accelerator_type is None
            and head_node_type.accelerator_count > 0
        ):
            raise ValueError(
                "[Ray on Vertex]: accelerator_type must be specified when"
                + " accelerator_count is set to a value other than 0."
            )

    # Maps ResourcePool.id -> container image URI for that pool.
    resource_pool_images = {}

    # head node
    resource_pool_0 = ResourcePool()
    resource_pool_0.id = "head-node"
    resource_pool_0.replica_count = head_node_type.node_count
    resource_pool_0.machine_spec.machine_type = head_node_type.machine_type
    resource_pool_0.machine_spec.accelerator_count = head_node_type.accelerator_count
    resource_pool_0.machine_spec.accelerator_type = head_node_type.accelerator_type
    resource_pool_0.disk_spec.boot_disk_type = head_node_type.boot_disk_type
    resource_pool_0.disk_spec.boot_disk_size_gb = head_node_type.boot_disk_size_gb

    # Pick the head image: explicit per-node custom image wins, then the
    # prebuilt image for (ray_version, python_version, cuda), then
    # custom_images.head.
    enable_cuda = head_node_type.accelerator_count > 0
    if head_node_type.custom_image is not None:
        image_uri = head_node_type.custom_image
    elif custom_images is None:
        image_uri = _validation_utils.get_image_uri(
            ray_version, python_version, enable_cuda
        )
    elif custom_images.head is not None and custom_images.worker is not None:
        image_uri = custom_images.head
    else:
        raise ValueError(
            "[Ray on Vertex AI]: custom_images.head and custom_images.worker must be specified when custom_images is set."
        )

    resource_pool_images[resource_pool_0.id] = image_uri

    worker_pools = []
    i = 0
    if worker_node_types:
        for worker_node_type in worker_node_types:
            if (
                worker_node_type.accelerator_type is None
                and worker_node_type.accelerator_count > 0
            ):
                raise ValueError(
                    "[Ray on Vertex]: accelerator_type must be specified when"
                    + " accelerator_count is set to a value other than 0."
                )
            # Non-zero when this worker's machine spec matches the head's,
            # in which case the worker replicas are folded into the head pool.
            additional_replica_count = resources._check_machine_spec_identical(
                head_node_type, worker_node_type
            )
            if worker_node_type.autoscaling_spec is None:
                # Worker and head share the same MachineSpec, merge them into the
                # same ResourcePool
                resource_pool_0.replica_count = (
                    resource_pool_0.replica_count + additional_replica_count
                )
            else:
                if additional_replica_count > 0:
                    # Autoscaling for single ResourcePool (homogeneous cluster).
                    resource_pool_0.replica_count = None
                    resource_pool_0.autoscaling_spec.min_replica_count = (
                        worker_node_type.autoscaling_spec.min_replica_count
                    )
                    resource_pool_0.autoscaling_spec.max_replica_count = (
                        worker_node_type.autoscaling_spec.max_replica_count
                    )
            if additional_replica_count == 0:
                # Distinct machine spec: this worker type gets its own pool.
                resource_pool = ResourcePool()
                resource_pool.id = f"worker-pool{i+1}"
                if worker_node_type.autoscaling_spec is None:
                    resource_pool.replica_count = worker_node_type.node_count
                else:
                    # Autoscaling for worker ResourcePool.
                    resource_pool.autoscaling_spec.min_replica_count = (
                        worker_node_type.autoscaling_spec.min_replica_count
                    )
                    resource_pool.autoscaling_spec.max_replica_count = (
                        worker_node_type.autoscaling_spec.max_replica_count
                    )
                resource_pool.machine_spec.machine_type = worker_node_type.machine_type
                resource_pool.machine_spec.accelerator_count = (
                    worker_node_type.accelerator_count
                )
                resource_pool.machine_spec.accelerator_type = (
                    worker_node_type.accelerator_type
                )
                resource_pool.disk_spec.boot_disk_type = worker_node_type.boot_disk_type
                resource_pool.disk_spec.boot_disk_size_gb = (
                    worker_node_type.boot_disk_size_gb
                )
                worker_pools.append(resource_pool)
                enable_cuda = worker_node_type.accelerator_count > 0

                # Same image-selection precedence as for the head node.
                if worker_node_type.custom_image is not None:
                    image_uri = worker_node_type.custom_image
                elif custom_images is None:
                    image_uri = _validation_utils.get_image_uri(
                        ray_version, python_version, enable_cuda
                    )
                else:
                    image_uri = custom_images.worker

                resource_pool_images[resource_pool.id] = image_uri

            i += 1

    resource_pools = [resource_pool_0] + worker_pools

    # The API expresses these as "disabled" flags, inverted from our params.
    metrics_collection_disabled = not enable_metrics_collection
    ray_metric_spec = RayMetricSpec(disabled=metrics_collection_disabled)

    logging_disabled = not enable_logging
    ray_logs_spec = RayLogsSpec(disabled=logging_disabled)

    ray_spec = RaySpec(
        resource_pool_images=resource_pool_images,
        ray_metric_spec=ray_metric_spec,
        ray_logs_spec=ray_logs_spec,
    )
    if nfs_mounts:
        gapic_nfs_mounts = []
        for nfs_mount in nfs_mounts:
            gapic_nfs_mounts.append(
                NfsMount(
                    server=nfs_mount.server,
                    path=nfs_mount.path,
                    mount_point=nfs_mount.mount_point,
                )
            )
        ray_spec.nfs_mounts = gapic_nfs_mounts
    if service_account:
        service_account_spec = ServiceAccountSpec(
            enable_custom_service_account=True,
            service_account=service_account,
        )
        resource_runtime_spec = ResourceRuntimeSpec(
            ray_spec=ray_spec,
            service_account_spec=service_account_spec,
        )
    else:
        resource_runtime_spec = ResourceRuntimeSpec(ray_spec=ray_spec)
    if psc_interface_config:
        gapic_psc_interface_config = PscInterfaceConfig(
            network_attachment=psc_interface_config.network_attachment,
        )
    else:
        gapic_psc_interface_config = None

    persistent_resource = PersistentResource(
        resource_pools=resource_pools,
        network=network,
        labels=labels,
        resource_runtime_spec=resource_runtime_spec,
        psc_interface_config=gapic_psc_interface_config,
        reserved_ip_ranges=reserved_ip_ranges,
    )

    location = initializer.global_config.location
    project_id = initializer.global_config.project
    project_number = resource_manager_utils.get_project_number(project_id)

    parent = f"projects/{project_number}/locations/{location}"
    request = persistent_resource_service.CreatePersistentResourceRequest(
        parent=parent,
        persistent_resource=persistent_resource,
        persistent_resource_id=cluster_name,
    )

    client = _gapic_utils.create_persistent_resource_client()
    try:
        _ = client.create_persistent_resource(request)
    except Exception as e:
        raise ValueError("Failed in cluster creation due to: ", e) from e

    # Get persisent resource
    cluster_resource_name = f"{parent}/persistentResources/{cluster_name}"
    response = _gapic_utils.get_persistent_resource(
        persistent_resource_name=cluster_resource_name,
        tolerance=1,  # allow 1 retry to avoid get request before creation
    )
    return response.name
|
||||
|
||||
|
||||
def delete_ray_cluster(cluster_resource_name: str) -> None:
    """Tear down the given Ray cluster on Vertex AI.

    Args:
        cluster_resource_name: Full resource name of the cluster's
            PersistentResource.

    Raises:
        ValueError: If the deletion request fails (wraps the underlying
            service error, e.g. FailedPrecondition when the cluster is
            already deleted).
    """
    delete_request = persistent_resource_service.DeletePersistentResourceRequest(
        name=cluster_resource_name
    )
    api_client = _gapic_utils.create_persistent_resource_client()

    try:
        api_client.delete_persistent_resource(delete_request)
    except Exception as err:
        raise ValueError(
            "[Ray on Vertex AI]: Failed in cluster deletion due to: ", err
        ) from err
    else:
        print("[Ray on Vertex AI]: Successfully deleted the cluster.")
|
||||
|
||||
|
||||
def get_ray_cluster(cluster_resource_name: str) -> resources.Cluster:
    """Fetch a Ray cluster on Vertex AI by resource name.

    Args:
        cluster_resource_name: Full resource name of the cluster's
            PersistentResource.

    Returns:
        A Cluster object describing the cluster.

    Raises:
        ValueError: If the lookup fails, or if the resource exists but is
            not a usable Ray cluster.
    """
    api_client = _gapic_utils.create_persistent_resource_client()
    get_request = persistent_resource_service.GetPersistentResourceRequest(
        name=cluster_resource_name
    )
    try:
        persistent_resource = api_client.get_persistent_resource(get_request)
    except Exception as err:
        raise ValueError(
            "[Ray on Vertex AI]: Failed in getting the cluster due to: ", err
        ) from err

    # Conversion yields a falsy result when the resource is not a Ray
    # cluster (or its image is too old to be supported).
    cluster = _gapic_utils.persistent_resource_to_cluster(
        persistent_resource=persistent_resource
    )
    if not cluster:
        raise ValueError(
            "[Ray on Vertex AI]: Please delete and recreate the cluster (The cluster is not a Ray cluster or the cluster image is outdated)."
        )
    return cluster
|
||||
|
||||
|
||||
def list_ray_clusters() -> List[resources.Cluster]:
    """List Ray Clusters under the currently authenticated project.

    Returns:
        List of Cluster objects that exists in the current authorized project.
    """
    # Resolve the parent path from the globally initialized project/location.
    region = initializer.global_config.location
    project = initializer.global_config.project
    project_number = resource_manager_utils.get_project_number(project)
    parent = f"projects/{project_number}/locations/{region}"

    list_request = persistent_resource_service.ListPersistentResourcesRequest(
        parent=parent,
    )
    api_client = _gapic_utils.create_persistent_resource_client()
    try:
        page_iterator = api_client.list_persistent_resources(list_request)
    except Exception as err:
        raise ValueError(
            "[Ray on Vertex AI]: Failed in listing the clusters due to: ", err
        ) from err

    # Keep only resources that convert to a valid Ray cluster; conversion
    # returns a falsy value for non-Ray persistent resources.
    converted = (
        _gapic_utils.persistent_resource_to_cluster(persistent_resource=pr)
        for pr in page_iterator
    )
    return [cluster for cluster in converted if cluster]
|
||||
|
||||
|
||||
def update_ray_cluster(
    cluster_resource_name: str, worker_node_types: List[resources.Resources]
) -> str:
    """Update Ray Cluster (currently support resizing node counts for worker nodes).

    Sample usage:

    my_cluster = vertex_ray.get_ray_cluster(
        cluster_resource_name=my_existing_cluster_resource_name,
    )

    # Declaration to resize all the worker_node_type to node_count=1
    new_worker_node_types = []
    for worker_node_type in my_cluster.worker_node_types:
        worker_node_type.node_count = 1
        new_worker_node_types.append(worker_node_type)

    # Execution to update new node_count (block until complete)
    vertex_ray.update_ray_cluster(
        cluster_resource_name=my_cluster.cluster_resource_name,
        worker_node_types=new_worker_node_types,
    )

    Args:
        cluster_resource_name: Full resource name of the cluster's
            PersistentResource.
        worker_node_types: The list of Resources of the resized worker nodes.
            The same Resources object should not appear multiple times in the list.

    Returns:
        The cluster_resource_name of the Ray cluster on Vertex.

    Raises:
        ValueError: If worker_node_types contains duplicate machine specs,
            does not match the existing number of worker pools, requests an
            invalid node count, or if the update request fails.
    """
    # worker_node_types should not be duplicated.
    for i in range(len(worker_node_types)):
        for j in range(len(worker_node_types)):
            additional_replica_count = resources._check_machine_spec_identical(
                worker_node_types[i], worker_node_types[j]
            )
            if additional_replica_count > 0 and i != j:
                raise ValueError(
                    "[Ray on Vertex AI]: Worker_node_types have duplicate "
                    + f"machine specs: {worker_node_types[i]} "
                    + f"and {worker_node_types[j]}"
                )

    persistent_resource = _gapic_utils.get_persistent_resource(
        persistent_resource_name=cluster_resource_name
    )

    # Work on a copy so the fetched resource is not mutated; the head pool
    # always has exactly one head replica.
    current_persistent_resource = copy.deepcopy(persistent_resource)
    current_persistent_resource.resource_pools[0].replica_count = 1

    previous_ray_cluster = get_ray_cluster(cluster_resource_name)
    head_node_type = previous_ray_cluster.head_node_type
    previous_worker_node_types = previous_ray_cluster.worker_node_types

    # new worker_node_types and previous_worker_node_types should be the same length.
    if len(worker_node_types) != len(previous_worker_node_types):
        # BUG FIX: the %i placeholders were previously passed un-interpolated
        # (the counts were extra ValueError args), so the message never showed
        # the actual numbers. Interpolate them here.
        raise ValueError(
            "[Ray on Vertex AI]: Desired number of worker_node_types "
            "(%i) does not match the number of the "
            "existing worker_node_type(%i)."
            % (len(worker_node_types), len(previous_worker_node_types))
        )

    # Merge worker_node_type and head_node_type if they share
    # the same machine spec.
    not_merged = 1
    for i in range(len(worker_node_types)):
        additional_replica_count = resources._check_machine_spec_identical(
            head_node_type, worker_node_types[i]
        )
        if additional_replica_count != 0 or (
            additional_replica_count == 0 and worker_node_types[i].node_count == 0
        ):
            # Merge the 1st duplicated worker with head, allow scale down to 0 worker
            current_persistent_resource.resource_pools[0].replica_count = (
                1 + additional_replica_count
            )
            # Reset not_merged
            not_merged = 0
        else:
            # No duplication w/ head node, write the 2nd worker node to the 2nd resource pool.
            current_persistent_resource.resource_pools[
                i + not_merged
            ].replica_count = worker_node_types[i].node_count
            # New worker_node_type.node_count should be >=1 unless the worker_node_type
            # and head_node_type are merged due to the same machine specs.
            if worker_node_types[i].node_count == 0:
                raise ValueError(
                    "[Ray on Vertex AI]: Worker_node_type "
                    + f"({worker_node_types[i]}) must update to >= 1 nodes",
                )

    request = persistent_resource_service.UpdatePersistentResourceRequest(
        persistent_resource=current_persistent_resource,
        update_mask=field_mask_pb2.FieldMask(paths=["resource_pools.replica_count"]),
    )
    client = _gapic_utils.create_persistent_resource_client()
    try:
        operation_future = client.update_persistent_resource(request)
    except Exception as e:
        raise ValueError(
            "[Ray on Vertex AI]: Failed in updating the cluster due to: ", e
        ) from e

    # block before returning
    start_time = time.time()
    response = operation_future.result()
    duration = (time.time() - start_time) // 60
    # BUG FIX: "mininutes" -> "minutes" in the user-facing message.
    print(
        "[Ray on Vertex AI]: Successfully updated the cluster ({} minutes elapsed).".format(
            duration
        )
    )
    return response.name
|
||||
Reference in New Issue
Block a user