structure saas with tools

This commit is contained in:
Davidson Gomes
2025-04-25 15:30:54 -03:00
commit 1aef473937
16434 changed files with 6584257 additions and 0 deletions

View File

@@ -0,0 +1,288 @@
# -*- coding: utf-8 -*-
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import datetime
import logging
import time
from typing import Optional
from google.api_core import exceptions
from google.cloud.aiplatform import initializer
from google.cloud.aiplatform.utils import (
PersistentResourceClientWithOverride,
)
from google.cloud.aiplatform.vertex_ray.util import _validation_utils
from google.cloud.aiplatform.vertex_ray.util.resources import (
AutoscalingSpec,
Cluster,
PscIConfig,
Resources,
)
from google.cloud.aiplatform_v1beta1.types.persistent_resource import (
PersistentResource,
)
from google.cloud.aiplatform_v1beta1.types.persistent_resource_service import (
GetPersistentResourceRequest,
)
# Path fragments used to recognize Vertex-provided (non-custom) Ray images.
# The region prefix (e.g. "us") precedes the leading "-", so these are tested
# with substring membership against a full image uri.
_PRIVATE_PREVIEW_IMAGE = "-docker.pkg.dev/vertex-ai/training/tf-"
_OFFICIAL_IMAGE = "-docker.pkg.dev/vertex-ai/training/ray-"
def create_persistent_resource_client():
    """Build a v1beta1 client for the PersistentResource service.

    The project and location are inherited from the global configuration
    established by aiplatform.init().
    """
    client = initializer.global_config.create_client(
        client_class=PersistentResourceClientWithOverride,
        appended_gapic_version="vertex_ray",
    )
    return client.select_version("v1beta1")
def polling_delay(num_attempts: int, time_scale: float) -> datetime.timedelta:
    """Compute the delay before the next poll of the Vertex service.

    Implements bounded exponential decay: the interval starts at
    max(time_scale, 30s) and shrinks by a factor of 0.765 per attempt,
    flattening out after 6 attempts (polling starts slow, then speeds up).

    Args:
        num_attempts: How many times we have already polled without finding
            the desired result.
        time_scale: The longest initial polling interval, in seconds, or
            zero. Zero (or anything below 30) is clamped up to 30 seconds.

    Returns:
        A recommended delay interval as a datetime.timedelta.
    """
    floor_seconds = 30.0
    base = max(time_scale, floor_seconds)
    decay = 0.765 ** min(num_attempts, 6)
    return datetime.timedelta(seconds=base * decay)
def get_persistent_resource(
    persistent_resource_name: str, tolerance: Optional[int] = 0
):
    """Poll the Vertex service until a persistent resource is RUNNING.

    Args:
        persistent_resource_name:
            "projects/<project_num>/locations/<region>/persistentResources/<pr_id>".
        tolerance: Number of attempts to get the persistent resource; 404
            responses are tolerated until this many polls have happened.

    Returns:
        aiplatform_v1.PersistentResource if state is RUNNING.

    Raises:
        ValueError: Invalid cluster resource name (still 404 after
            `tolerance` attempts).
        RuntimeError: Service returns error.
        RuntimeError: Cluster resource state is STOPPING.
        RuntimeError: Cluster resource state is ERROR.
    """
    client = create_persistent_resource_client()
    request = GetPersistentResourceRequest(name=persistent_resource_name)
    # TODO(b/277117901): Add test cases for polling and error handling
    num_attempts = 0
    while True:
        try:
            response = client.get_persistent_resource(request)
        except exceptions.NotFound:
            # The resource may not be visible immediately after creation;
            # only fail once the caller-supplied tolerance is exceeded.
            response = None
            if num_attempts >= tolerance:
                raise ValueError(
                    "[Ray on Vertex AI]: Invalid cluster_resource_name (404 not found)."
                )
        if response:
            if response.error.message:
                logging.error("[Ray on Vertex AI]: %s" % response.error.message)
                raise RuntimeError("[Ray on Vertex AI]: Cluster returned an error.")
            print("[Ray on Vertex AI]: Cluster State =", response.state)
            if response.state == PersistentResource.State.RUNNING:
                return response
            elif response.state == PersistentResource.State.STOPPING:
                raise RuntimeError("[Ray on Vertex AI]: The cluster is stopping.")
            elif response.state == PersistentResource.State.ERROR:
                raise RuntimeError(
                    "[Ray on Vertex AI]: The cluster encountered an error."
                )
        # Polling decay: sleep between polls, with the interval shrinking on
        # each attempt (see polling_delay).
        sleep_time = polling_delay(num_attempts=num_attempts, time_scale=150.0)
        num_attempts += 1
        print(
            "Waiting for cluster provisioning; attempt {}; sleeping for {} seconds".format(
                num_attempts, sleep_time
            )
        )
        time.sleep(sleep_time.total_seconds())
def persistent_resource_to_cluster(
    persistent_resource: PersistentResource,
) -> Optional[Cluster]:
    """Convert a PersistentResource proto into a vertex_ray Cluster.

    Args:
        persistent_resource: PersistentResource.

    Returns:
        Cluster, or None when the persistent resource has no RaySpec or is
        built on an outdated private-preview image.
    """
    dashboard_address = persistent_resource.resource_runtime.access_uris.get(
        "RAY_DASHBOARD_URI"
    )
    cluster = Cluster(
        cluster_resource_name=persistent_resource.name,
        network=persistent_resource.network,
        reserved_ip_ranges=persistent_resource.reserved_ip_ranges,
        state=persistent_resource.state.name,
        labels=persistent_resource.labels,
        dashboard_address=dashboard_address,
    )
    if not persistent_resource.resource_runtime_spec.ray_spec:
        # skip PersistentResource without RaySpec
        logging.info(
            "[Ray on Vertex AI]: Cluster %s does not have Ray installed."
            % persistent_resource.name,
        )
        return
    if persistent_resource.psc_interface_config:
        cluster.psc_interface_config = PscIConfig(
            network_attachment=persistent_resource.psc_interface_config.network_attachment
        )
    resource_pools = persistent_resource.resource_pools
    # The first resource pool is always the head node's pool.
    head_resource_pool = resource_pools[0]
    head_id = head_resource_pool.id
    head_image_uri = (
        persistent_resource.resource_runtime_spec.ray_spec.resource_pool_images[head_id]
    )
    if persistent_resource.resource_runtime_spec.service_account_spec.service_account:
        cluster.service_account = (
            persistent_resource.resource_runtime_spec.service_account_spec.service_account
        )
    if not head_image_uri:
        # Fall back to the cluster-wide image when no per-pool image is set.
        head_image_uri = persistent_resource.resource_runtime_spec.ray_spec.image_uri
    try:
        python_version, ray_version = _validation_utils.get_versions_from_image_uri(
            head_image_uri
        )
    except IndexError:
        # IndexError means the image label did not follow the official
        # "ray-{cpu,gpu}.<ray>.py<python>" naming scheme.
        if _PRIVATE_PREVIEW_IMAGE in head_image_uri:
            # If using outdated images
            logging.info(
                "[Ray on Vertex AI]: The image of cluster %s is outdated."
                " It is recommended to delete and recreate the cluster to obtain"
                " the latest image." % persistent_resource.name
            )
            return None
        else:
            # Custom image might also cause IndexError
            python_version = None
            ray_version = None
    cluster.python_version = python_version
    cluster.ray_version = ray_version
    # The proto stores "disabled" flags; the Cluster exposes "enabled" flags.
    cluster.ray_metric_enabled = not (
        persistent_resource.resource_runtime_spec.ray_spec.ray_metric_spec.disabled
    )
    cluster.ray_logs_enabled = not (
        persistent_resource.resource_runtime_spec.ray_spec.ray_logs_spec.disabled
    )
    accelerator_type = head_resource_pool.machine_spec.accelerator_type
    if accelerator_type.value != 0:
        accelerator_type = accelerator_type.name
    else:
        # Enum value 0 is the unspecified accelerator type — treat as none.
        accelerator_type = None
    if _OFFICIAL_IMAGE in head_image_uri:
        # Official training image is not custom
        head_image_uri = None
    head_node_type = Resources(
        machine_type=head_resource_pool.machine_spec.machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=head_resource_pool.machine_spec.accelerator_count,
        boot_disk_type=head_resource_pool.disk_spec.boot_disk_type,
        boot_disk_size_gb=head_resource_pool.disk_spec.boot_disk_size_gb,
        node_count=1,
        custom_image=head_image_uri,
    )
    worker_node_types = []
    if head_resource_pool.replica_count > 1:
        # head_node_type.node_count must be 1. If the head_resource_pool (the first
        # resource pool) has replica_count > 1, the rest replica are worker nodes.
        worker_node_count = head_resource_pool.replica_count - 1
        worker_node_types.append(
            Resources(
                machine_type=head_resource_pool.machine_spec.machine_type,
                accelerator_type=accelerator_type,
                accelerator_count=head_resource_pool.machine_spec.accelerator_count,
                boot_disk_type=head_resource_pool.disk_spec.boot_disk_type,
                boot_disk_size_gb=head_resource_pool.disk_spec.boot_disk_size_gb,
                node_count=worker_node_count,
                custom_image=head_image_uri,
            )
        )
        if head_resource_pool.autoscaling_spec:
            worker_node_types[0].autoscaling_spec = AutoscalingSpec(
                min_replica_count=head_resource_pool.autoscaling_spec.min_replica_count,
                max_replica_count=head_resource_pool.autoscaling_spec.max_replica_count,
            )
    for i in range(len(resource_pools) - 1):
        # Convert the second and more resource pools to vertex_ray.Resources,
        # and append them to worker_node_types.
        accelerator_type = resource_pools[i + 1].machine_spec.accelerator_type
        if accelerator_type.value != 0:
            accelerator_type = accelerator_type.name
        else:
            accelerator_type = None
        worker_image_uri = (
            persistent_resource.resource_runtime_spec.ray_spec.resource_pool_images[
                resource_pools[i + 1].id
            ]
        )
        if _OFFICIAL_IMAGE in worker_image_uri:
            # Official training image is not custom
            worker_image_uri = None
        resource = Resources(
            machine_type=resource_pools[i + 1].machine_spec.machine_type,
            accelerator_type=accelerator_type,
            accelerator_count=resource_pools[i + 1].machine_spec.accelerator_count,
            boot_disk_type=resource_pools[i + 1].disk_spec.boot_disk_type,
            boot_disk_size_gb=resource_pools[i + 1].disk_spec.boot_disk_size_gb,
            node_count=resource_pools[i + 1].replica_count,
            custom_image=worker_image_uri,
        )
        if resource_pools[i + 1].autoscaling_spec:
            resource.autoscaling_spec = AutoscalingSpec(
                min_replica_count=resource_pools[
                    i + 1
                ].autoscaling_spec.min_replica_count,
                max_replica_count=resource_pools[
                    i + 1
                ].autoscaling_spec.max_replica_count,
            )
        worker_node_types.append(resource)
    cluster.head_node_type = head_node_type
    cluster.worker_node_types = worker_node_types
    return cluster

View File

@@ -0,0 +1,167 @@
# -*- coding: utf-8 -*-
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import google.auth
import google.auth.transport.requests
import logging
import ray
import re
from immutabledict import immutabledict
from google.cloud.aiplatform import initializer
from google.cloud.aiplatform.utils import resource_manager_utils
# Ray minor version -> full version of the official images.
SUPPORTED_RAY_VERSIONS = immutabledict(
    {"2.9": "2.9.3", "2.33": "2.33.0", "2.42": "2.42.0"}
)
# Python version -> tuple of Ray minor versions supported for it.
SUPPORTED_RAY_VERSIONS_FROM_PYTHON_VERSIONS = immutabledict(
    {
        "3.10": ("2.9", "2.33", "2.42"),
        # NOTE: must be a one-element tuple. `("2.42")` is just the string
        # "2.42", and `x in "2.42"` does substring matching, so e.g. "2.4"
        # would incorrectly be accepted as supported for Python 3.11.
        "3.11": ("2.42",),
    }
)
_V2_4_WARNING_MESSAGE = (
    "After google-cloud-aiplatform>1.53.0, using Ray version = 2.4 will result"
    " in an error. Please use Ray version = 2.33.0 or 2.42.0 (default) instead."
)
_V2_9_WARNING_MESSAGE = (
    "In March 2025, using Ray version = 2.9 will result in an error. "
    "Please use Ray version = 2.33.0 or 2.42.0 (default) instead."
)
# Artifact Repository available regions.
_AVAILABLE_REGIONS = ["us", "europe", "asia"]
# If region is not available, assume using the default region.
_DEFAULT_REGION = "us"
_PERSISTENT_RESOURCE_NAME_PATTERN = "projects/{}/locations/{}/persistentResources/{}"
_VALID_RESOURCE_NAME_REGEX = "[a-z][a-zA-Z0-9._-]{0,127}"
_DASHBOARD_URI_SUFFIX = "aiplatform-training.googleusercontent.com"
def valid_resource_name(resource_name):
    """Raise ValueError unless *resource_name* is a full persistent resource name."""
    parts = resource_name.split("/")
    well_formed = (
        len(parts) == 6
        and parts[0] == "projects"
        and parts[2] == "locations"
        and parts[4] == "persistentResources"
    )
    if not well_formed:
        raise ValueError(
            "[Ray on Vertex AI]: Address must be in the following "
            "format: vertex_ray://projects/<project_num>/locations/<region>/persistentResources/<pr_id> "
            "or vertex_ray://<pr_id>."
        )
def maybe_reconstruct_resource_name(address) -> str:
    """Expand a bare persistent resource id into a full resource name.

    An address that is already a full
    "projects/.../locations/.../persistentResources/..." path (or anything
    not matching the bare-id pattern) is returned unchanged.
    """
    if not re.match("^{}$".format(_VALID_RESOURCE_NAME_REGEX), address):
        return address
    # Only the cluster name (persistent resource id) was supplied; rebuild
    # the full name from the globally-configured project and location.
    logging.info(
        "[Ray on Vertex AI]: Cluster name was given as address, reconstructing full resource name"
    )
    project_number = resource_manager_utils.get_project_number(
        initializer.global_config.project
    )
    return _PERSISTENT_RESOURCE_NAME_PATTERN.format(
        project_number,
        initializer.global_config.location,
        address,
    )
def get_local_ray_version():
    """Return the locally-installed Ray version, trimmed to "major.minor"."""
    parts = ray.__version__.split(".")
    # A standard "X.Y.Z" version is trimmed to "X.Y"; anything else is
    # returned as-is.
    return ".".join(parts[:2] if len(parts) == 3 else parts)
def get_image_uri(ray_version, python_version, enable_cuda):
    """Image uri for a given ray version and python version.

    Args:
        ray_version: Ray minor version, e.g. "2.42".
        python_version: Python version, e.g. "3.10".
        enable_cuda: Whether to return the GPU (cuda) image variant.

    Returns:
        The Artifact Registry uri of the matching official Ray image.

    Raises:
        ValueError: If the Ray version, the Python version, or their
            combination is unsupported.
    """
    if ray_version not in SUPPORTED_RAY_VERSIONS:
        # Build the message from the mapping instead of hard-coding the first
        # two entries, so every supported version (including 2.42) is listed
        # and the message stays correct as versions are added.
        supported = " and ".join(
            "%s (%s)" % (minor, full)
            for minor, full in SUPPORTED_RAY_VERSIONS.items()
        )
        raise ValueError(
            "[Ray on Vertex AI]: The supported Ray versions are %s." % supported
        )
    if python_version not in SUPPORTED_RAY_VERSIONS_FROM_PYTHON_VERSIONS:
        raise ValueError(
            "[Ray on Vertex AI]: The supported Python versions are 3.10 or 3.11."
        )
    if ray_version not in SUPPORTED_RAY_VERSIONS_FROM_PYTHON_VERSIONS[python_version]:
        raise ValueError(
            "[Ray on Vertex AI]: The supported Ray version(s) for Python version %s: %s."
            % (
                python_version,
                SUPPORTED_RAY_VERSIONS_FROM_PYTHON_VERSIONS[python_version],
            )
        )
    # Artifact Registry repos exist only in a few multi-regions; fall back to
    # the default when the configured location is not one of them.
    location = initializer.global_config.location
    region = location.split("-")[0]
    if region not in _AVAILABLE_REGIONS:
        region = _DEFAULT_REGION
    # Image labels use "2-42" / "py310" style tokens.
    ray_version = ray_version.replace(".", "-")
    python_version = python_version.replace(".", "")
    if enable_cuda:
        return f"{region}-docker.pkg.dev/vertex-ai/training/ray-gpu.{ray_version}.py{python_version}:latest"
    else:
        return f"{region}-docker.pkg.dev/vertex-ai/training/ray-cpu.{ray_version}.py{python_version}:latest"
def get_versions_from_image_uri(image_uri):
    """Get ray version and python version from image uri."""
    logging.info(f"[Ray on Vertex AI]: Getting versions from image uri: {image_uri}")
    # Label is the path's last component without the tag,
    # e.g. "ray-cpu.2-42.py310".
    image_label = image_uri.split("/")[-1].split(":")[0]
    py_version = "{}.{}".format(image_label[-3], image_label[-2:])
    ray_version = image_label.split(".")[1].replace("-", ".")
    supported = SUPPORTED_RAY_VERSIONS_FROM_PYTHON_VERSIONS.get(py_version)
    if supported is not None and ray_version in supported:
        return py_version, ray_version
    # Custom images may not follow the official naming scheme, so the
    # versions cannot be determined.
    return None, None
def valid_dashboard_address(address):
    """Return whether *address* ends with the Vertex dashboard domain."""
    suffix_start = len(address) - len(_DASHBOARD_URI_SUFFIX)
    return suffix_start >= 0 and address[suffix_start:] == _DASHBOARD_URI_SUFFIX
def get_bearer_token():
    """Get bearer token through Application Default Credentials."""
    credentials, _ = google.auth.default(
        scopes=["https://www.googleapis.com/auth/cloud-platform"]
    )
    # Freshly-obtained credentials carry no token yet (creds.valid is False
    # and creds.token is None); a refresh populates both.
    credentials.refresh(google.auth.transport.requests.Request())
    return credentials.token

View File

@@ -0,0 +1,217 @@
# -*- coding: utf-8 -*-
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import dataclasses
from typing import Dict, List, Optional
from google.cloud.aiplatform_v1beta1.types import PersistentResource
@dataclasses.dataclass
class AutoscalingSpec:
    """Autoscaling spec for a ray cluster node.

    Attributes:
        min_replica_count: The minimum number of replicas in the cluster.
        max_replica_count: The maximum number of replicas in the cluster.
    """

    min_replica_count: int = 1
    max_replica_count: int = 2
@dataclasses.dataclass
class Resources:
    """Resources for a ray cluster node.

    Attributes:
        machine_type: See the list of machine types:
            https://cloud.google.com/vertex-ai/docs/training/configure-compute#machine-types
        node_count: This argument represents how many nodes to start for the
            ray cluster.
        accelerator_type: e.g. "NVIDIA_TESLA_P4".
            Vertex AI supports the following types of GPU:
            https://cloud.google.com/vertex-ai/docs/training/configure-compute#specifying_gpus
        accelerator_count: The number of accelerators to attach to the machine.
        boot_disk_type: Type of the boot disk (default is "pd-ssd").
            Valid values: "pd-ssd" (Persistent Disk Solid State Drive) or
            "pd-standard" (Persistent Disk Hard Disk Drive).
        boot_disk_size_gb: Size in GB of the boot disk (default is 100GB). Must
            be either unspecified or within the range of [100, 64000].
        custom_image: Custom image for this resource (e.g.
            us-docker.pkg.dev/my-project/ray-gpu.2-9.py310-tf:latest).
        autoscaling_spec: Autoscaling spec for this resource.
    """

    machine_type: Optional[str] = "n1-standard-16"
    node_count: Optional[int] = 1
    accelerator_type: Optional[str] = None
    # 0 means no accelerators attached.
    accelerator_count: Optional[int] = 0
    boot_disk_type: Optional[str] = "pd-ssd"
    boot_disk_size_gb: Optional[int] = 100
    # None means the official base image is used.
    custom_image: Optional[str] = None
    # None means fixed-size (no autoscaling).
    autoscaling_spec: Optional[AutoscalingSpec] = None
@dataclasses.dataclass
class NodeImages:
    """Custom images for a ray cluster.

    We currently support Ray v2.9, v2.33, v2.42 and python v3.10.
    We also support python v3.11 for Ray v2.42.
    The custom images must be extended from the following base images:
    "{region}-docker.pkg.dev/vertex-ai/training/ray-cpu.2-9.py310:latest",
    "{region}-docker.pkg.dev/vertex-ai/training/ray-gpu.2-9.py310:latest",
    "{region}-docker.pkg.dev/vertex-ai/training/ray-cpu.2-33.py310:latest",
    "{region}-docker.pkg.dev/vertex-ai/training/ray-gpu.2-33.py310:latest",
    "{region}-docker.pkg.dev/vertex-ai/training/ray-cpu.2-42.py310:latest",
    "{region}-docker.pkg.dev/vertex-ai/training/ray-gpu.2-42.py310:latest",
    "{region}-docker.pkg.dev/vertex-ai/training/ray-cpu.2-42.py311:latest", or
    "{region}-docker.pkg.dev/vertex-ai/training/ray-gpu.2-42.py311:latest". In
    order to use custom images, need to specify both head and worker images.

    Attributes:
        head: image for head node (eg. us-docker.pkg.dev/my-project/ray-cpu.2-33.py310-tf:latest).
        worker: image for all worker nodes (eg. us-docker.pkg.dev/my-project/ray-gpu.2-33.py310-tf:latest).
    """

    # Per the class docstring, head and worker must be specified together.
    head: Optional[str] = None
    worker: Optional[str] = None
@dataclasses.dataclass
class PscIConfig:
    """PSC-I config.

    Attributes:
        network_attachment: Optional. The name or full name of the Compute Engine
            `network attachment <https://cloud.google.com/vpc/docs/about-network-attachments>`
            to attach to the resource. It has a format:
            ``projects/{project}/regions/{region}/networkAttachments/{networkAttachment}``.
            Where {project} is a project number, as in ``12345``, and
            {networkAttachment} is a network attachment name. To specify
            this field, you must have already [created a network
            attachment]
            (https://cloud.google.com/vpc/docs/create-manage-network-attachments#create-network-attachments).
            This field is only used for resources using PSC-I. Make sure you do not
            specify the network here for VPC peering.
    """

    network_attachment: Optional[str] = None
@dataclasses.dataclass
class NfsMount:
    """NFS mount.

    Attributes:
        server: Required. IP address of the NFS server.
        path: Required. Source path exported from NFS server. Has to start
            with '/', and combined with the ip address, it indicates the
            source mount path in the form of ``server:path``.
        mount_point: Required. Destination mount path. The NFS will be mounted
            for the user under /mnt/nfs/<mount_point>.
    """

    server: Optional[str] = None
    path: Optional[str] = None
    mount_point: Optional[str] = None
@dataclasses.dataclass
class Cluster:
    """Ray cluster (output only).

    Attributes:
        cluster_resource_name: It has a format:
            "projects/<project_num>/locations/<region>/persistentResources/<pr_id>".
        network: Virtual private cloud (VPC) network. It has a format:
            "projects/<project_num>/global/networks/<network_name>".
            For Ray Client, VPC peering is required to connect to the cluster
            managed in the Vertex API service. For Ray Job API, VPC network is
            not required because cluster connection can be accessed through
            dashboard address.
        reserved_ip_ranges: A list of names for the reserved IP ranges under
            the VPC network that can be used for this cluster. If set, we will
            deploy the cluster within the provided IP ranges. Otherwise, the
            cluster is deployed to any IP ranges under the provided VPC network.
            Example: ["vertex-ai-ip-range"].
        service_account: Service account to be used for running Ray programs on
            the cluster.
        state: Describes the cluster state (defined in PersistentResource.State).
        python_version: Python version for the ray cluster (e.g. "3.10").
        ray_version: Ray version for the ray cluster (e.g. "2.33").
        head_node_type: The head node resource. Resources.node_count must be 1.
            If not set, by default it is a CPU node with machine_type of n1-standard-8.
        worker_node_types: The list of Resources of the worker nodes. Should not
            duplicate the elements in the list.
        dashboard_address: For Ray Job API (JobSubmissionClient), with this
            cluster connection doesn't require VPC peering.
        labels:
            The labels with user-defined metadata to organize Ray cluster.
            Label keys and values can be no longer than 64 characters (Unicode
            codepoints), can only contain lowercase letters, numeric characters,
            underscores and dashes. International characters are allowed.
            See https://goo.gl/xmQnxf for more information and examples of labels.
    """

    cluster_resource_name: Optional[str] = None
    network: Optional[str] = None
    reserved_ip_ranges: Optional[List[str]] = None
    service_account: Optional[str] = None
    state: Optional[PersistentResource.State] = None
    python_version: Optional[str] = None
    ray_version: Optional[str] = None
    head_node_type: Optional[Resources] = None
    worker_node_types: Optional[List[Resources]] = None
    dashboard_address: Optional[str] = None
    ray_metric_enabled: bool = True
    ray_logs_enabled: bool = True
    psc_interface_config: Optional[PscIConfig] = None
    labels: Optional[Dict[str, str]] = None
def _check_machine_spec_identical(
    node_type_1: Resources,
    node_type_2: Resources,
) -> int:
    """Check if node_type_1 and node_type_2 have the same machine_spec.

    If the machine specs (machine type, accelerator type, accelerator count)
    match, validate that the disk specs also match and return node_type_2's
    node_count as the additional replica count; otherwise return 0.
    """
    same_machine_spec = (
        node_type_1.machine_type == node_type_2.machine_type
        and node_type_1.accelerator_type == node_type_2.accelerator_type
        and node_type_1.accelerator_count == node_type_2.accelerator_count
    )
    if not same_machine_spec:
        return 0
    # Nodes sharing one machine spec must also share their disk spec.
    if node_type_1.boot_disk_type != node_type_2.boot_disk_type:
        raise ValueError(
            "Worker disk type must match the head node's disk type if"
            " sharing the same machine_type, accelerator_type, and"
            " accelerator_count"
        )
    if node_type_1.boot_disk_size_gb != node_type_2.boot_disk_size_gb:
        raise ValueError(
            "Worker disk size must match the head node's disk size if"
            " sharing the same machine_type, accelerator_type, and"
            " accelerator_count"
        )
    return node_type_2.node_count