structure saas with tools
This commit is contained in:
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,288 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2023 Google LLC
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import datetime
|
||||
import logging
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
from google.api_core import exceptions
|
||||
from google.cloud.aiplatform import initializer
|
||||
from google.cloud.aiplatform.utils import (
|
||||
PersistentResourceClientWithOverride,
|
||||
)
|
||||
from google.cloud.aiplatform.vertex_ray.util import _validation_utils
|
||||
from google.cloud.aiplatform.vertex_ray.util.resources import (
|
||||
AutoscalingSpec,
|
||||
Cluster,
|
||||
PscIConfig,
|
||||
Resources,
|
||||
)
|
||||
from google.cloud.aiplatform_v1beta1.types.persistent_resource import (
|
||||
PersistentResource,
|
||||
)
|
||||
from google.cloud.aiplatform_v1beta1.types.persistent_resource_service import (
|
||||
GetPersistentResourceRequest,
|
||||
)
|
||||
|
||||
|
||||
# URI fragments used to classify a cluster's head-node image.
# _PRIVATE_PREVIEW_IMAGE marks old private-preview ("tf-") images, which are
# treated as outdated when version parsing fails; _OFFICIAL_IMAGE marks the
# supported official Ray training images, which are never reported as custom.
_PRIVATE_PREVIEW_IMAGE = "-docker.pkg.dev/vertex-ai/training/tf-"
_OFFICIAL_IMAGE = "-docker.pkg.dev/vertex-ai/training/ray-"
|
||||
|
||||
|
||||
def create_persistent_resource_client():
    """Build a v1beta1 PersistentResource service client.

    Project and location are inherited from the global configuration set
    via aiplatform.init().
    """
    client = initializer.global_config.create_client(
        client_class=PersistentResourceClientWithOverride,
        appended_gapic_version="vertex_ray",
    )
    return client.select_version("v1beta1")
|
||||
|
||||
|
||||
def polling_delay(num_attempts: int, time_scale: float) -> datetime.timedelta:
    """Return how long to wait before the next poll of the Vertex service.

    Implements bounded exponential decay: the delay starts at
    max(time_scale, 30) seconds and shrinks by a factor of 0.765 per
    attempt, flattening out after six attempts (later attempts all sleep
    the same amount).

    Args:
        num_attempts: How many polls have already come back without the
            desired result.
        time_scale: The initial polling interval, in seconds; values below
            30 (including zero) are clamped up to 30 seconds.

    Returns:
        The recommended delay as a datetime.timedelta.
    """
    floor_seconds = 30.0
    decay_factor = 0.765
    max_decay_steps = 6

    base = max(time_scale, floor_seconds)
    delay_seconds = base * decay_factor ** min(num_attempts, max_decay_steps)
    return datetime.timedelta(seconds=delay_seconds)
|
||||
|
||||
|
||||
def get_persistent_resource(
    persistent_resource_name: str, tolerance: Optional[int] = 0
):
    """Fetch a persistent resource, polling until its state is RUNNING.

    Args:
        persistent_resource_name:
            "projects/<project_num>/locations/<region>/persistentResources/<pr_id>".
        tolerance: Number of attempts to get persistent resource before a
            404 response is treated as a fatal error.

    Returns:
        aiplatform_v1.PersistentResource if state is RUNNING.

    Raises:
        ValueError: Invalid cluster resource name.
        RuntimeError: Service returns error.
        RuntimeError: Cluster resource state is STOPPING.
        RuntimeError: Cluster resource state is ERROR.
    """

    client = create_persistent_resource_client()
    request = GetPersistentResourceRequest(name=persistent_resource_name)

    # TODO(b/277117901): Add test cases for polling and error handling
    num_attempts = 0
    while True:
        response = None
        try:
            response = client.get_persistent_resource(request)
        except exceptions.NotFound:
            # Tolerate transient 404s for up to `tolerance` attempts.
            if num_attempts >= tolerance:
                raise ValueError(
                    "[Ray on Vertex AI]: Invalid cluster_resource_name (404 not found)."
                )
        if response:
            if response.error.message:
                logging.error("[Ray on Vertex AI]: %s", response.error.message)
                raise RuntimeError("[Ray on Vertex AI]: Cluster returned an error.")

            print("[Ray on Vertex AI]: Cluster State =", response.state)
            if response.state == PersistentResource.State.RUNNING:
                return response
            if response.state == PersistentResource.State.STOPPING:
                raise RuntimeError("[Ray on Vertex AI]: The cluster is stopping.")
            if response.state == PersistentResource.State.ERROR:
                raise RuntimeError(
                    "[Ray on Vertex AI]: The cluster encountered an error."
                )
        # Exponentially decaying backoff before the next poll.
        sleep_time = polling_delay(num_attempts=num_attempts, time_scale=150.0)
        num_attempts += 1
        print(
            f"Waiting for cluster provisioning; attempt {num_attempts};"
            f" sleeping for {sleep_time} seconds"
        )
        time.sleep(sleep_time.total_seconds())
|
||||
|
||||
|
||||
def persistent_resource_to_cluster(
    persistent_resource: PersistentResource,
) -> Optional[Cluster]:
    """Convert a PersistentResource proto to a Cluster object.

    Args:
        persistent_resource: PersistentResource.

    Returns:
        Cluster, or None when the resource has no RaySpec or uses an
        outdated private-preview image.
    """
    dashboard_address = persistent_resource.resource_runtime.access_uris.get(
        "RAY_DASHBOARD_URI"
    )
    cluster = Cluster(
        cluster_resource_name=persistent_resource.name,
        network=persistent_resource.network,
        reserved_ip_ranges=persistent_resource.reserved_ip_ranges,
        state=persistent_resource.state.name,
        labels=persistent_resource.labels,
        dashboard_address=dashboard_address,
    )
    if not persistent_resource.resource_runtime_spec.ray_spec:
        # skip PersistentResource without RaySpec
        logging.info(
            "[Ray on Vertex AI]: Cluster %s does not have Ray installed."
            % persistent_resource.name,
        )
        return
    if persistent_resource.psc_interface_config:
        cluster.psc_interface_config = PscIConfig(
            network_attachment=persistent_resource.psc_interface_config.network_attachment
        )
    resource_pools = persistent_resource.resource_pools

    # The first resource pool is the head-node pool; the rest are workers.
    head_resource_pool = resource_pools[0]
    head_id = head_resource_pool.id
    head_image_uri = (
        persistent_resource.resource_runtime_spec.ray_spec.resource_pool_images[head_id]
    )
    if persistent_resource.resource_runtime_spec.service_account_spec.service_account:
        cluster.service_account = (
            persistent_resource.resource_runtime_spec.service_account_spec.service_account
        )
    if not head_image_uri:
        # Fall back to the cluster-wide image when no per-pool image is set.
        head_image_uri = persistent_resource.resource_runtime_spec.ray_spec.image_uri

    try:
        python_version, ray_version = _validation_utils.get_versions_from_image_uri(
            head_image_uri
        )
    except IndexError:
        if _PRIVATE_PREVIEW_IMAGE in head_image_uri:
            # If using outdated images
            logging.info(
                "[Ray on Vertex AI]: The image of cluster %s is outdated."
                " It is recommended to delete and recreate the cluster to obtain"
                " the latest image." % persistent_resource.name
            )
            return None
        else:
            # Custom image might also cause IndexError
            python_version = None
            ray_version = None
    cluster.python_version = python_version
    cluster.ray_version = ray_version
    # The proto stores negative "disabled" flags; Cluster exposes the
    # positive "enabled" form.
    cluster.ray_metric_enabled = not (
        persistent_resource.resource_runtime_spec.ray_spec.ray_metric_spec.disabled
    )
    cluster.ray_logs_enabled = not (
        persistent_resource.resource_runtime_spec.ray_spec.ray_logs_spec.disabled
    )

    accelerator_type = head_resource_pool.machine_spec.accelerator_type
    if accelerator_type.value != 0:
        # Nonzero enum value means an accelerator is attached; expose its name.
        accelerator_type = accelerator_type.name
    else:
        accelerator_type = None
    if _OFFICIAL_IMAGE in head_image_uri:
        # Official training image is not custom
        head_image_uri = None
    head_node_type = Resources(
        machine_type=head_resource_pool.machine_spec.machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=head_resource_pool.machine_spec.accelerator_count,
        boot_disk_type=head_resource_pool.disk_spec.boot_disk_type,
        boot_disk_size_gb=head_resource_pool.disk_spec.boot_disk_size_gb,
        node_count=1,
        custom_image=head_image_uri,
    )
    worker_node_types = []
    if head_resource_pool.replica_count > 1:
        # head_node_type.node_count must be 1. If the head_resource_pool (the first
        # resource pool) has replica_count > 1, the rest replica are worker nodes.
        worker_node_count = head_resource_pool.replica_count - 1
        worker_node_types.append(
            Resources(
                machine_type=head_resource_pool.machine_spec.machine_type,
                accelerator_type=accelerator_type,
                accelerator_count=head_resource_pool.machine_spec.accelerator_count,
                boot_disk_type=head_resource_pool.disk_spec.boot_disk_type,
                boot_disk_size_gb=head_resource_pool.disk_spec.boot_disk_size_gb,
                node_count=worker_node_count,
                custom_image=head_image_uri,
            )
        )
        if head_resource_pool.autoscaling_spec:
            worker_node_types[0].autoscaling_spec = AutoscalingSpec(
                min_replica_count=head_resource_pool.autoscaling_spec.min_replica_count,
                max_replica_count=head_resource_pool.autoscaling_spec.max_replica_count,
            )
    for i in range(len(resource_pools) - 1):
        # Convert the second and more resource pools to vertex_ray.Resources,
        # and append then to worker_node_types.
        accelerator_type = resource_pools[i + 1].machine_spec.accelerator_type
        if accelerator_type.value != 0:
            accelerator_type = accelerator_type.name
        else:
            accelerator_type = None
        worker_image_uri = (
            persistent_resource.resource_runtime_spec.ray_spec.resource_pool_images[
                resource_pools[i + 1].id
            ]
        )
        if _OFFICIAL_IMAGE in worker_image_uri:
            # Official training image is not custom
            worker_image_uri = None

        resource = Resources(
            machine_type=resource_pools[i + 1].machine_spec.machine_type,
            accelerator_type=accelerator_type,
            accelerator_count=resource_pools[i + 1].machine_spec.accelerator_count,
            boot_disk_type=resource_pools[i + 1].disk_spec.boot_disk_type,
            boot_disk_size_gb=resource_pools[i + 1].disk_spec.boot_disk_size_gb,
            node_count=resource_pools[i + 1].replica_count,
            custom_image=worker_image_uri,
        )
        if resource_pools[i + 1].autoscaling_spec:
            resource.autoscaling_spec = AutoscalingSpec(
                min_replica_count=resource_pools[
                    i + 1
                ].autoscaling_spec.min_replica_count,
                max_replica_count=resource_pools[
                    i + 1
                ].autoscaling_spec.max_replica_count,
            )

        worker_node_types.append(resource)

    cluster.head_node_type = head_node_type
    cluster.worker_node_types = worker_node_types

    return cluster
|
||||
@@ -0,0 +1,167 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2025 Google LLC
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import google.auth
|
||||
import google.auth.transport.requests
|
||||
import logging
|
||||
import ray
|
||||
import re
|
||||
from immutabledict import immutabledict
|
||||
|
||||
from google.cloud.aiplatform import initializer
|
||||
from google.cloud.aiplatform.utils import resource_manager_utils
|
||||
|
||||
SUPPORTED_RAY_VERSIONS = immutabledict(
    {"2.9": "2.9.3", "2.33": "2.33.0", "2.42": "2.42.0"}
)
# Maps a Python minor version to the tuple of Ray versions it supports.
# Values must be tuples, never bare strings: `("2.42")` is just the string
# "2.42", and `in` on a string does substring matching, so e.g.
# "2.4" in "2.42" would wrongly pass the version checks below.
SUPPORTED_RAY_VERSIONS_FROM_PYTHON_VERSIONS = immutabledict(
    {
        "3.10": ("2.9", "2.33", "2.42"),
        "3.11": ("2.42",),
    }
)
_V2_4_WARNING_MESSAGE = (
    "After google-cloud-aiplatform>1.53.0, using Ray version = 2.4 will result"
    " in an error. Please use Ray version = 2.33.0 or 2.42.0 (default) instead."
)
_V2_9_WARNING_MESSAGE = (
    "In March 2025, using Ray version = 2.9 will result in an error. "
    "Please use Ray version = 2.33.0 or 2.42.0 (default) instead."
)


# Artifact Repository available regions.
_AVAILABLE_REGIONS = ["us", "europe", "asia"]
# If region is not available, assume using the default region.
_DEFAULT_REGION = "us"

_PERSISTENT_RESOURCE_NAME_PATTERN = "projects/{}/locations/{}/persistentResources/{}"
_VALID_RESOURCE_NAME_REGEX = "[a-z][a-zA-Z0-9._-]{0,127}"
_DASHBOARD_URI_SUFFIX = "aiplatform-training.googleusercontent.com"
|
||||
|
||||
|
||||
def valid_resource_name(resource_name):
    """Raise ValueError unless *resource_name* is a full persistent resource name."""
    parts = resource_name.split("/")
    well_formed = (
        len(parts) == 6
        and parts[0] == "projects"
        and parts[2] == "locations"
        and parts[4] == "persistentResources"
    )
    if well_formed:
        return
    raise ValueError(
        "[Ray on Vertex AI]: Address must be in the following "
        "format: vertex_ray://projects/<project_num>/locations/<region>/persistentResources/<pr_id> "
        "or vertex_ray://<pr_id>."
    )
|
||||
|
||||
|
||||
def maybe_reconstruct_resource_name(address) -> str:
    """Expand a bare persistent resource id into a full resource name.

    Addresses that are not bare ids (e.g. full resource names) pass
    through unchanged.
    """
    if not re.match("^{}$".format(_VALID_RESOURCE_NAME_REGEX), address):
        return address
    # Assume only cluster name (persistent resource id) was given.
    logging.info(
        "[Ray on Vertex AI]: Cluster name was given as address, reconstructing full resource name"
    )
    project_number = resource_manager_utils.get_project_number(
        initializer.global_config.project
    )
    return _PERSISTENT_RESOURCE_NAME_PATTERN.format(
        project_number,
        initializer.global_config.location,
        address,
    )
|
||||
|
||||
|
||||
def get_local_ray_version():
    """Return the locally installed Ray version as "major.minor"."""
    components = ray.__version__.split(".")
    # Drop the patch component when present (e.g. "2.42.0" -> "2.42").
    if len(components) == 3:
        components = components[:2]
    return ".".join(components)
|
||||
|
||||
|
||||
def get_image_uri(ray_version, python_version, enable_cuda):
    """Image uri for a given ray version and python version.

    Args:
        ray_version: Ray minor version string, e.g. "2.42".
        python_version: Python minor version string, e.g. "3.10".
        enable_cuda: If true, return the GPU (CUDA) image; otherwise the
            CPU image.

    Returns:
        The Artifact Registry image uri for the configured region.

    Raises:
        ValueError: If the Ray version, the Python version, or their
            combination is not supported.
    """
    if ray_version not in SUPPORTED_RAY_VERSIONS:
        # Build the message from the full mapping so every supported version
        # is listed (the previous hard-coded message only showed the first
        # two entries and omitted the 2.42 default).
        supported_versions = ", ".join(
            "%s (%s)" % (key, value) for key, value in SUPPORTED_RAY_VERSIONS.items()
        )
        raise ValueError(
            "[Ray on Vertex AI]: The supported Ray versions are %s."
            % supported_versions
        )
    if python_version not in SUPPORTED_RAY_VERSIONS_FROM_PYTHON_VERSIONS:
        raise ValueError(
            "[Ray on Vertex AI]: The supported Python versions are 3.10 or 3.11."
        )

    if ray_version not in SUPPORTED_RAY_VERSIONS_FROM_PYTHON_VERSIONS[python_version]:
        raise ValueError(
            "[Ray on Vertex AI]: The supported Ray version(s) for Python version %s: %s."
            % (
                python_version,
                SUPPORTED_RAY_VERSIONS_FROM_PYTHON_VERSIONS[python_version],
            )
        )

    location = initializer.global_config.location
    region = location.split("-")[0]
    if region not in _AVAILABLE_REGIONS:
        # Fall back to the default multi-region Artifact Registry host.
        region = _DEFAULT_REGION
    # Image tags use "-" in the Ray version and no dot in the Python version.
    ray_version = ray_version.replace(".", "-")
    python_version = python_version.replace(".", "")
    if enable_cuda:
        return f"{region}-docker.pkg.dev/vertex-ai/training/ray-gpu.{ray_version}.py{python_version}:latest"
    else:
        return f"{region}-docker.pkg.dev/vertex-ai/training/ray-cpu.{ray_version}.py{python_version}:latest"
|
||||
|
||||
|
||||
def get_versions_from_image_uri(image_uri):
    """Extract (python_version, ray_version) from an image uri.

    Returns (None, None) when the parsed pair is not a supported
    combination (e.g. for custom images that happen to parse).
    """
    logging.info(f"[Ray on Vertex AI]: Getting versions from image uri: {image_uri}")
    image_label = image_uri.split("/")[-1].split(":")[0]
    # Label layout: "...<ray-major>-<ray-minor>.py<python digits>", so the
    # last three characters encode the Python version.
    py_version = image_label[-3] + "." + image_label[-2:]
    ray_version = image_label.split(".")[1].replace("-", ".")
    supported_for_python = SUPPORTED_RAY_VERSIONS_FROM_PYTHON_VERSIONS.get(
        py_version, ()
    )
    if ray_version in supported_for_python:
        return py_version, ray_version
    # May not parse custom image and get the versions correctly
    return None, None
|
||||
|
||||
|
||||
def valid_dashboard_address(address):
    """Return True when *address* is a Vertex-hosted Ray dashboard uri."""
    # Dashboard URIs are always served under the Vertex training domain.
    return address.endswith(_DASHBOARD_URI_SUFFIX)
|
||||
|
||||
|
||||
def get_bearer_token():
    """Fetch an OAuth2 bearer token via Application Default Credentials."""
    scopes = ["https://www.googleapis.com/auth/cloud-platform"]
    creds, _ = google.auth.default(scopes=scopes)

    # Freshly loaded ADC credentials carry no token (creds.valid is False,
    # creds.token is None); refresh once to populate it.
    creds.refresh(google.auth.transport.requests.Request())
    return creds.token
|
||||
217
.venv/lib/python3.10/site-packages/vertex_ray/util/resources.py
Normal file
217
.venv/lib/python3.10/site-packages/vertex_ray/util/resources.py
Normal file
@@ -0,0 +1,217 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2023 Google LLC
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
import dataclasses
|
||||
from typing import Dict, List, Optional
|
||||
from google.cloud.aiplatform_v1beta1.types import PersistentResource
|
||||
|
||||
|
||||
@dataclasses.dataclass
class AutoscalingSpec:
    """Autoscaling spec for a ray cluster node.

    Attributes:
        min_replica_count: The minimum number of replicas in the cluster.
            Defaults to 1.
        max_replica_count: The maximum number of replicas in the cluster.
            Defaults to 2.
    """

    min_replica_count: int = 1
    max_replica_count: int = 2
|
||||
|
||||
|
||||
@dataclasses.dataclass
class Resources:
    """Resources for a ray cluster node.

    Attributes:
        machine_type: See the list of machine types:
            https://cloud.google.com/vertex-ai/docs/training/configure-compute#machine-types
        node_count: This argument represents how many nodes to start for the
            ray cluster.
        accelerator_type: e.g. "NVIDIA_TESLA_P4".
            Vertex AI supports the following types of GPU:
            https://cloud.google.com/vertex-ai/docs/training/configure-compute#specifying_gpus
        accelerator_count: The number of accelerators to attach to the machine
            (0, the default, means none).
        boot_disk_type: Type of the boot disk (default is "pd-ssd").
            Valid values: "pd-ssd" (Persistent Disk Solid State Drive) or
            "pd-standard" (Persistent Disk Hard Disk Drive).
        boot_disk_size_gb: Size in GB of the boot disk (default is 100GB). Must
            be either unspecified or within the range of [100, 64000].
        custom_image: Custom image for this resource (e.g.
            us-docker.pkg.dev/my-project/ray-gpu.2-9.py310-tf:latest).
        autoscaling_spec: Autoscaling spec for this resource.
    """

    machine_type: Optional[str] = "n1-standard-16"
    node_count: Optional[int] = 1
    accelerator_type: Optional[str] = None
    accelerator_count: Optional[int] = 0
    boot_disk_type: Optional[str] = "pd-ssd"
    boot_disk_size_gb: Optional[int] = 100
    custom_image: Optional[str] = None
    autoscaling_spec: Optional[AutoscalingSpec] = None
|
||||
|
||||
|
||||
@dataclasses.dataclass
class NodeImages:
    """Custom images for a ray cluster.

    We currently support Ray v2.9, v2.33, v2.42 and python v3.10.
    We also support python v3.11 for Ray v2.42.
    The custom images must be extended from the following base images:
    "{region}-docker.pkg.dev/vertex-ai/training/ray-cpu.2-9.py310:latest",
    "{region}-docker.pkg.dev/vertex-ai/training/ray-gpu.2-9.py310:latest",
    "{region}-docker.pkg.dev/vertex-ai/training/ray-cpu.2-33.py310:latest",
    "{region}-docker.pkg.dev/vertex-ai/training/ray-gpu.2-33.py310:latest",
    "{region}-docker.pkg.dev/vertex-ai/training/ray-cpu.2-42.py310:latest",
    "{region}-docker.pkg.dev/vertex-ai/training/ray-gpu.2-42.py310:latest",
    "{region}-docker.pkg.dev/vertex-ai/training/ray-cpu.2-42.py311:latest", or
    "{region}-docker.pkg.dev/vertex-ai/training/ray-gpu.2-42.py311:latest". In
    order to use custom images, need to specify both head and worker images.

    Attributes:
        head: image for head node (eg. us-docker.pkg.dev/my-project/ray-cpu.2-33.py310-tf:latest).
        worker: image for all worker nodes (eg. us-docker.pkg.dev/my-project/ray-gpu.2-33.py310-tf:latest).
    """

    head: Optional[str] = None
    worker: Optional[str] = None
|
||||
|
||||
|
||||
@dataclasses.dataclass
class PscIConfig:
    """PSC-I config.

    Attributes:
        network_attachment: Optional. The name or full name of the Compute Engine
            `network attachment <https://cloud.google.com/vpc/docs/about-network-attachments>`
            to attach to the resource. It has a format:
            ``projects/{project}/regions/{region}/networkAttachments/{networkAttachment}``.
            Where {project} is a project number, as in ``12345``, and
            {networkAttachment} is a network attachment name. To specify
            this field, you must have already [created a network
            attachment]
            (https://cloud.google.com/vpc/docs/create-manage-network-attachments#create-network-attachments).
            This field is only used for resources using PSC-I. Make sure you do not
            specify the network here for VPC peering.
    """

    network_attachment: Optional[str] = None
|
||||
|
||||
|
||||
@dataclasses.dataclass
class NfsMount:
    """NFS mount.

    Attributes:
        server: Required. IP address of the NFS server.
        path: Required. Source path exported from NFS server. Has to start
            with '/', and combined with the ip address, it indicates the
            source mount path in the form of ``server:path``.
        mount_point: Required. Destination mount path. The NFS will be mounted
            for the user under /mnt/nfs/<mount_point>.
    """

    # Fields are annotated Optional because they default to None, but all
    # three are required for a usable mount (see Attributes above).
    server: Optional[str] = None
    path: Optional[str] = None
    mount_point: Optional[str] = None
|
||||
|
||||
|
||||
@dataclasses.dataclass
class Cluster:
    """Ray cluster (output only).

    Attributes:
        cluster_resource_name: It has a format:
            "projects/<project_num>/locations/<region>/persistentResources/<pr_id>".
        network: Virtual private cloud (VPC) network. It has a format:
            "projects/<project_num>/global/networks/<network_name>".
            For Ray Client, VPC peering is required to connect to the cluster
            managed in the Vertex API service. For Ray Job API, VPC network is
            not required because cluster connection can be accessed through
            dashboard address.
        reserved_ip_ranges: A list of names for the reserved IP ranges under
            the VPC network that can be used for this cluster. If set, we will
            deploy the cluster within the provided IP ranges. Otherwise, the
            cluster is deployed to any IP ranges under the provided VPC network.
            Example: ["vertex-ai-ip-range"].
        service_account: Service account to be used for running Ray programs on
            the cluster.
        state: Describes the cluster state (defined in PersistentResource.State).
        python_version: Python version for the ray cluster (e.g. "3.10").
        ray_version: Ray version for the ray cluster (e.g. "2.33").
        head_node_type: The head node resource. Resources.node_count must be 1.
            If not set, by default it is a CPU node with machine_type of n1-standard-8.
        worker_node_types: The list of Resources of the worker nodes. Should not
            duplicate the elements in the list.
        dashboard_address: For Ray Job API (JobSubmissionClient), with this
            cluster connection doesn't require VPC peering.
        ray_metric_enabled: Whether Ray metrics are enabled for the cluster.
        ray_logs_enabled: Whether Ray logs are enabled for the cluster.
        psc_interface_config: PSC-I configuration, when the cluster uses it.
        labels:
            The labels with user-defined metadata to organize Ray cluster.

            Label keys and values can be no longer than 64 characters (Unicode
            codepoints), can only contain lowercase letters, numeric characters,
            underscores and dashes. International characters are allowed.

            See https://goo.gl/xmQnxf for more information and examples of labels.
    """

    # All identifying fields default to None: instances are populated by the
    # SDK from a PersistentResource, not constructed directly by users.
    cluster_resource_name: Optional[str] = None
    network: Optional[str] = None
    reserved_ip_ranges: Optional[List[str]] = None
    service_account: Optional[str] = None
    state: Optional[PersistentResource.State] = None
    python_version: Optional[str] = None
    ray_version: Optional[str] = None
    head_node_type: Optional[Resources] = None
    worker_node_types: Optional[List[Resources]] = None
    dashboard_address: Optional[str] = None
    ray_metric_enabled: bool = True
    ray_logs_enabled: bool = True
    psc_interface_config: Optional[PscIConfig] = None
    labels: Optional[Dict[str, str]] = None
|
||||
|
||||
|
||||
def _check_machine_spec_identical(
    node_type_1: Resources,
    node_type_2: Resources,
) -> int:
    """Return node_type_2's node_count when both share a machine spec, else 0.

    Raises:
        ValueError: If the machine specs match but the boot disk type or
            size differ between the two node types.
    """
    same_machine_spec = (
        node_type_1.machine_type == node_type_2.machine_type
        and node_type_1.accelerator_type == node_type_2.accelerator_type
        and node_type_1.accelerator_count == node_type_2.accelerator_count
    )
    if not same_machine_spec:
        return 0

    # Identical machine specs must also agree on disk configuration.
    if node_type_1.boot_disk_type != node_type_2.boot_disk_type:
        raise ValueError(
            "Worker disk type must match the head node's disk type if"
            " sharing the same machine_type, accelerator_type, and"
            " accelerator_count"
        )
    if node_type_1.boot_disk_size_gb != node_type_2.boot_disk_size_gb:
        raise ValueError(
            "Worker disk size must match the head node's disk size if"
            " sharing the same machine_type, accelerator_type, and"
            " accelerator_count"
        )
    return node_type_2.node_count
|
||||
Reference in New Issue
Block a user