# -*- coding: utf-8 -*-
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import dataclasses
from typing import Dict, List, Optional

from google.cloud.aiplatform_v1beta1.types import PersistentResource

@dataclasses.dataclass
class AutoscalingSpec:
    """Autoscaling spec for a ray cluster node.

    Attributes:
        min_replica_count: The minimum number of replicas in the cluster.
        max_replica_count: The maximum number of replicas in the cluster.
    """

    min_replica_count: int = 1
    max_replica_count: int = 2
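
# Example (illustrative; the replica counts are placeholders): allow a node
# pool to scale between one and four replicas.
#
#     autoscaling = AutoscalingSpec(min_replica_count=1, max_replica_count=4)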

@dataclasses.dataclass
class Resources:
    """Resources for a ray cluster node.

    Attributes:
        machine_type: See the list of machine types:
            https://cloud.google.com/vertex-ai/docs/training/configure-compute#machine-types
        node_count: This argument represents how many nodes to start for the
            ray cluster.
        accelerator_type: e.g. "NVIDIA_TESLA_P4". Vertex AI supports the
            following types of GPU:
            https://cloud.google.com/vertex-ai/docs/training/configure-compute#specifying_gpus
        accelerator_count: The number of accelerators to attach to the machine.
        boot_disk_type: Type of the boot disk (default is "pd-ssd").
            Valid values: "pd-ssd" (Persistent Disk Solid State Drive) or
            "pd-standard" (Persistent Disk Hard Disk Drive).
        boot_disk_size_gb: Size in GB of the boot disk (default is 100GB). Must
            be either unspecified or within the range of [100, 64000].
        custom_image: Custom image for this resource (e.g.
            us-docker.pkg.dev/my-project/ray-gpu.2-9.py310-tf:latest).
        autoscaling_spec: Autoscaling spec for this resource.
    """

    machine_type: Optional[str] = "n1-standard-16"
    node_count: Optional[int] = 1
    accelerator_type: Optional[str] = None
    accelerator_count: Optional[int] = 0
    boot_disk_type: Optional[str] = "pd-ssd"
    boot_disk_size_gb: Optional[int] = 100
    custom_image: Optional[str] = None
    autoscaling_spec: Optional[AutoscalingSpec] = None
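
# Example (illustrative; the machine and accelerator values are placeholders,
# not recommendations): a fixed two-node GPU worker spec, and a variant that
# autoscales between one and four replicas instead of using a fixed
# node_count.
#
#     gpu_workers = Resources(
#         machine_type="n1-standard-16",
#         node_count=2,
#         accelerator_type="NVIDIA_TESLA_T4",
#         accelerator_count=1,
#     )
#     autoscaled_workers = Resources(
#         machine_type="n1-standard-16",
#         accelerator_type="NVIDIA_TESLA_T4",
#         accelerator_count=1,
#         autoscaling_spec=AutoscalingSpec(min_replica_count=1, max_replica_count=4),
#     )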

@dataclasses.dataclass
class NodeImages:
    """Custom images for a ray cluster.

    We currently support Ray v2.9, v2.33, v2.42 and Python v3.10.
    We also support Python v3.11 for Ray v2.42.
    The custom images must be extended from the following base images:
    "{region}-docker.pkg.dev/vertex-ai/training/ray-cpu.2-9.py310:latest",
    "{region}-docker.pkg.dev/vertex-ai/training/ray-gpu.2-9.py310:latest",
    "{region}-docker.pkg.dev/vertex-ai/training/ray-cpu.2-33.py310:latest",
    "{region}-docker.pkg.dev/vertex-ai/training/ray-gpu.2-33.py310:latest",
    "{region}-docker.pkg.dev/vertex-ai/training/ray-cpu.2-42.py310:latest",
    "{region}-docker.pkg.dev/vertex-ai/training/ray-gpu.2-42.py310:latest",
    "{region}-docker.pkg.dev/vertex-ai/training/ray-cpu.2-42.py311:latest", or
    "{region}-docker.pkg.dev/vertex-ai/training/ray-gpu.2-42.py311:latest".
    To use custom images, you must specify both the head and worker images.

    Attributes:
        head: Image for the head node (e.g.
            us-docker.pkg.dev/my-project/ray-cpu.2-33.py310-tf:latest).
        worker: Image for all worker nodes (e.g.
            us-docker.pkg.dev/my-project/ray-gpu.2-33.py310-tf:latest).
    """

    head: Optional[str] = None
    worker: Optional[str] = None
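
# Example (illustrative; the image URIs are placeholders for custom images
# extended from the base images listed above):
#
#     images = NodeImages(
#         head="us-docker.pkg.dev/my-project/ray-cpu.2-33.py310-tf:latest",
#         worker="us-docker.pkg.dev/my-project/ray-gpu.2-33.py310-tf:latest",
#     )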

@dataclasses.dataclass
class PscIConfig:
    """PSC-I (Private Service Connect interface) config.

    Attributes:
        network_attachment: Optional. The name or full name of the Compute
            Engine `network attachment
            <https://cloud.google.com/vpc/docs/about-network-attachments>`__
            to attach to the resource, in the format
            ``projects/{project}/regions/{region}/networkAttachments/{networkAttachment}``,
            where {project} is a project number, as in ``12345``, and
            {networkAttachment} is a network attachment name. To specify this
            field, you must have already `created a network attachment
            <https://cloud.google.com/vpc/docs/create-manage-network-attachments#create-network-attachments>`__.
            This field is only used for resources using PSC-I. Do not specify
            a network here when using VPC peering.
    """

    network_attachment: Optional[str] = None
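
# Example (illustrative; the project number and attachment name are
# placeholders):
#
#     psc_config = PscIConfig(
#         network_attachment="projects/12345/regions/us-central1/networkAttachments/my-attachment"
#     )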

@dataclasses.dataclass
class NfsMount:
    """NFS mount.

    Attributes:
        server: Required. IP address of the NFS server.
        path: Required. Source path exported from the NFS server. Must start
            with '/'; combined with the server address, it forms the source
            mount path ``server:path``.
        mount_point: Required. Destination mount path. The NFS volume will be
            mounted for the user under /mnt/nfs/<mount_point>.
    """

    server: Optional[str] = None
    path: Optional[str] = None
    mount_point: Optional[str] = None
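
# Example (illustrative; the server address and paths are placeholders): mount
# 10.0.0.2:/exports/data under /mnt/nfs/data on the cluster nodes.
#
#     mount = NfsMount(server="10.0.0.2", path="/exports/data", mount_point="data")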

@dataclasses.dataclass
class Cluster:
    """Ray cluster (output only).

    Attributes:
        cluster_resource_name: It has a format:
            "projects/<project_num>/locations/<region>/persistentResources/<pr_id>".
        network: Virtual private cloud (VPC) network. It has a format:
            "projects/<project_num>/global/networks/<network_name>".
            For Ray Client, VPC peering is required to connect to the cluster
            managed in the Vertex API service. For the Ray Job API, a VPC
            network is not required because the cluster can be reached through
            its dashboard address.
        reserved_ip_ranges: A list of names for the reserved IP ranges under
            the VPC network that can be used for this cluster. If set, the
            cluster is deployed within the provided IP ranges. Otherwise, the
            cluster is deployed to any IP ranges under the provided VPC
            network. Example: ["vertex-ai-ip-range"].
        service_account: Service account to be used for running Ray programs
            on the cluster.
        state: Describes the cluster state (defined in PersistentResource.State).
        python_version: Python version for the ray cluster (e.g. "3.10").
        ray_version: Ray version for the ray cluster (e.g. "2.33").
        head_node_type: The head node resource. Resources.node_count must be 1.
            If not set, by default it is a CPU node with machine_type of
            n1-standard-8.
        worker_node_types: The list of Resources of the worker nodes. The list
            must not contain duplicate elements.
        dashboard_address: For the Ray Job API (JobSubmissionClient), a cluster
            connection through this address does not require VPC peering.
        ray_metric_enabled: Whether Ray metrics collection is enabled for the
            cluster (default is True).
        ray_logs_enabled: Whether Ray logging is enabled for the cluster
            (default is True).
        psc_interface_config: PSC-I config for the cluster, if PSC-I is used.
        labels: The labels with user-defined metadata to organize the Ray
            cluster. Label keys and values can be no longer than 64 characters
            (Unicode codepoints), and can only contain lowercase letters,
            numeric characters, underscores and dashes. International
            characters are allowed. See https://goo.gl/xmQnxf for more
            information and examples of labels.
    """

    cluster_resource_name: Optional[str] = None
    network: Optional[str] = None
    reserved_ip_ranges: Optional[List[str]] = None
    service_account: Optional[str] = None
    state: Optional[PersistentResource.State] = None
    python_version: Optional[str] = None
    ray_version: Optional[str] = None
    head_node_type: Optional[Resources] = None
    worker_node_types: Optional[List[Resources]] = None
    dashboard_address: Optional[str] = None
    ray_metric_enabled: bool = True
    ray_logs_enabled: bool = True
    psc_interface_config: Optional[PscIConfig] = None
    labels: Optional[Dict[str, str]] = None
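
# Example (illustrative): Cluster objects are output only and returned by the
# SDK rather than constructed by hand, so typical use is reading fields off a
# returned object. ``cluster`` below is a hypothetical value obtained from
# such a call.
#
#     print(cluster.cluster_resource_name)
#     print(cluster.state)
#     print(cluster.dashboard_address)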

def _check_machine_spec_identical(
    node_type_1: Resources,
    node_type_2: Resources,
) -> int:
    """Check whether node_type_1 and node_type_2 have the same machine spec.

    Two node types are considered identical when their machine_type,
    accelerator_type, and accelerator_count all match. If they are identical,
    their boot disk settings must also match, and node_type_2's node_count is
    returned as the additional replica count; otherwise 0 is returned.
    """
    additional_replica_count = 0
    # Check if the machine specs are the same.
    if (
        node_type_1.machine_type == node_type_2.machine_type
        and node_type_1.accelerator_type == node_type_2.accelerator_type
        and node_type_1.accelerator_count == node_type_2.accelerator_count
    ):
        if node_type_1.boot_disk_type != node_type_2.boot_disk_type:
            raise ValueError(
                "Worker disk type must match the head node's disk type if"
                " sharing the same machine_type, accelerator_type, and"
                " accelerator_count"
            )
        if node_type_1.boot_disk_size_gb != node_type_2.boot_disk_size_gb:
            raise ValueError(
                "Worker disk size must match the head node's disk size if"
                " sharing the same machine_type, accelerator_type, and"
                " accelerator_count"
            )
        additional_replica_count = node_type_2.node_count

    return additional_replica_count
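
# Example (illustrative): two specs that differ only in node_count share the
# same machine spec, so the second contributes its node_count as additional
# replicas; specs with different machine types contribute zero.
#
#     head = Resources(machine_type="n1-standard-16", node_count=1)
#     workers = Resources(machine_type="n1-standard-16", node_count=3)
#     _check_machine_spec_identical(head, workers)  # -> 3
#     other = Resources(machine_type="n1-standard-8", node_count=2)
#     _check_machine_spec_identical(head, other)  # -> 0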