# -*- coding: utf-8 -*-

# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import dataclasses
from typing import Dict, List, Optional

from google.cloud.aiplatform_v1beta1.types import PersistentResource


@dataclasses.dataclass
class AutoscalingSpec:
    """Autoscaling spec for a ray cluster node.

    Attributes:
        min_replica_count: The minimum number of replicas in the cluster.
        max_replica_count: The maximum number of replicas in the cluster.
    """

    min_replica_count: int = 1
    max_replica_count: int = 2


@dataclasses.dataclass
class Resources:
    """Resources for a ray cluster node.

    Attributes:
        machine_type: See the list of machine types:
            https://cloud.google.com/vertex-ai/docs/training/configure-compute#machine-types
        node_count: The number of nodes to start for the ray cluster.
        accelerator_type: e.g. "NVIDIA_TESLA_P4".
            Vertex AI supports the following types of GPU:
            https://cloud.google.com/vertex-ai/docs/training/configure-compute#specifying_gpus
        accelerator_count: The number of accelerators to attach to the machine.
        boot_disk_type: Type of the boot disk (default is "pd-ssd").
            Valid values: "pd-ssd" (Persistent Disk Solid State Drive) or
            "pd-standard" (Persistent Disk Hard Disk Drive).
        boot_disk_size_gb: Size in GB of the boot disk (default is 100GB). Must
            be either unspecified or within the range of [100, 64000].
        custom_image: Custom image for this resource (e.g.
            us-docker.pkg.dev/my-project/ray-gpu.2-9.py310-tf:latest).
        autoscaling_spec: Autoscaling spec for this resource.
    """

    machine_type: Optional[str] = "n1-standard-16"
    node_count: Optional[int] = 1
    accelerator_type: Optional[str] = None
    accelerator_count: Optional[int] = 0
    boot_disk_type: Optional[str] = "pd-ssd"
    boot_disk_size_gb: Optional[int] = 100
    custom_image: Optional[str] = None
    autoscaling_spec: Optional[AutoscalingSpec] = None
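

# A minimal usage sketch (not part of the module): a Resources spec for an
# autoscaled GPU worker pool. The accelerator and disk values below are
# assumptions chosen for illustration, not library defaults.
#
#   gpu_workers = Resources(
#       machine_type="n1-standard-16",
#       accelerator_type="NVIDIA_TESLA_T4",
#       accelerator_count=1,
#       boot_disk_size_gb=200,
#       autoscaling_spec=AutoscalingSpec(min_replica_count=1, max_replica_count=4),
#   )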


@dataclasses.dataclass
class NodeImages:
    """Custom images for a ray cluster.

    We currently support Ray v2.9, v2.33, v2.42 and Python v3.10.
    We also support Python v3.11 for Ray v2.42.
    The custom images must be extended from one of the following base images:
    "{region}-docker.pkg.dev/vertex-ai/training/ray-cpu.2-9.py310:latest",
    "{region}-docker.pkg.dev/vertex-ai/training/ray-gpu.2-9.py310:latest",
    "{region}-docker.pkg.dev/vertex-ai/training/ray-cpu.2-33.py310:latest",
    "{region}-docker.pkg.dev/vertex-ai/training/ray-gpu.2-33.py310:latest",
    "{region}-docker.pkg.dev/vertex-ai/training/ray-cpu.2-42.py310:latest",
    "{region}-docker.pkg.dev/vertex-ai/training/ray-gpu.2-42.py310:latest",
    "{region}-docker.pkg.dev/vertex-ai/training/ray-cpu.2-42.py311:latest", or
    "{region}-docker.pkg.dev/vertex-ai/training/ray-gpu.2-42.py311:latest".
    To use custom images, both head and worker images must be specified.

    Attributes:
        head: Image for the head node (e.g.
            us-docker.pkg.dev/my-project/ray-cpu.2-33.py310-tf:latest).
        worker: Image for all worker nodes (e.g.
            us-docker.pkg.dev/my-project/ray-gpu.2-33.py310-tf:latest).
    """

    head: Optional[str] = None
    worker: Optional[str] = None
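

# A minimal usage sketch (not part of the module): both images must be supplied,
# and each must be built on one of the supported base images listed above. The
# repository paths below are assumptions for illustration.
#
#   images = NodeImages(
#       head="us-docker.pkg.dev/my-project/ray-cpu.2-42.py310-custom:latest",
#       worker="us-docker.pkg.dev/my-project/ray-gpu.2-42.py310-custom:latest",
#   )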


@dataclasses.dataclass
class PscIConfig:
    """PSC-I config.

    Attributes:
        network_attachment: Optional. The name or full name of the Compute Engine
            `network attachment <https://cloud.google.com/vpc/docs/about-network-attachments>`
            to attach to the resource. It has the format
            ``projects/{project}/regions/{region}/networkAttachments/{networkAttachment}``,
            where {project} is a project number, as in ``12345``, and
            {networkAttachment} is a network attachment name. To specify
            this field, you must have already [created a network
            attachment](https://cloud.google.com/vpc/docs/create-manage-network-attachments#create-network-attachments).
            This field is only used for resources using PSC-I. Make sure you do
            not specify the network here for VPC peering.
    """

    network_attachment: Optional[str] = None
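

# A minimal usage sketch (not part of the module): the project number and
# attachment name below are assumptions for illustration; the network attachment
# must already exist before it is referenced here.
#
#   psc_config = PscIConfig(
#       network_attachment="projects/12345/regions/us-central1/networkAttachments/my-attachment"
#   )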


@dataclasses.dataclass
class NfsMount:
    """NFS mount.

    Attributes:
        server: Required. IP address of the NFS server.
        path: Required. Source path exported from the NFS server. Has to start
            with '/', and combined with the IP address, it indicates the
            source mount path in the form of ``server:path``.
        mount_point: Required. Destination mount path. The NFS will be mounted
            for the user under /mnt/nfs/<mount_point>.
    """

    server: Optional[str] = None
    path: Optional[str] = None
    mount_point: Optional[str] = None
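

# A minimal usage sketch (not part of the module): the server IP and paths are
# assumptions for illustration. This mount would appear on cluster nodes under
# /mnt/nfs/my_share.
#
#   nfs = NfsMount(server="10.0.0.10", path="/exports/data", mount_point="my_share")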


@dataclasses.dataclass
class Cluster:
    """Ray cluster (output only).

    Attributes:
        cluster_resource_name: It has the format
            "projects/<project_num>/locations/<region>/persistentResources/<pr_id>".
        network: Virtual private cloud (VPC) network. It has the format
            "projects/<project_num>/global/networks/<network_name>".
            For Ray Client, VPC peering is required to connect to the cluster
            managed in the Vertex API service. For the Ray Job API, a VPC
            network is not required because the cluster can be reached through
            the dashboard address.
        reserved_ip_ranges: A list of names for the reserved IP ranges under
            the VPC network that can be used for this cluster. If set, we will
            deploy the cluster within the provided IP ranges. Otherwise, the
            cluster is deployed to any IP ranges under the provided VPC network.
            Example: ["vertex-ai-ip-range"].
        service_account: Service account to be used for running Ray programs on
            the cluster.
        state: Describes the cluster state (defined in PersistentResource.State).
        python_version: Python version for the ray cluster (e.g. "3.10").
        ray_version: Ray version for the ray cluster (e.g. "2.33").
        head_node_type: The head node resource. Resources.node_count must be 1.
            If not set, by default it is a CPU node with machine_type of
            n1-standard-8.
        worker_node_types: The list of Resources of the worker nodes. The list
            must not contain duplicate elements.
        dashboard_address: Address of the Ray dashboard. With it, the Ray Job
            API (JobSubmissionClient) can connect to the cluster without VPC
            peering.
        ray_metric_enabled: Whether Ray metrics collection is enabled for the
            cluster (default True).
        ray_logs_enabled: Whether Ray logging is enabled for the cluster
            (default True).
        psc_interface_config: PSC-I config for the cluster, if any.
        labels: The labels with user-defined metadata to organize the Ray
            cluster.

            Label keys and values can be no longer than 64 characters (Unicode
            codepoints), and can only contain lowercase letters, numeric
            characters, underscores and dashes. International characters are
            allowed.

            See https://goo.gl/xmQnxf for more information and examples of
            labels.
    """

    cluster_resource_name: Optional[str] = None
    network: Optional[str] = None
    reserved_ip_ranges: Optional[List[str]] = None
    service_account: Optional[str] = None
    state: Optional[PersistentResource.State] = None
    python_version: Optional[str] = None
    ray_version: Optional[str] = None
    head_node_type: Optional[Resources] = None
    worker_node_types: Optional[List[Resources]] = None
    dashboard_address: Optional[str] = None
    ray_metric_enabled: bool = True
    ray_logs_enabled: bool = True
    psc_interface_config: Optional[PscIConfig] = None
    labels: Optional[Dict[str, str]] = None
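

# A minimal usage sketch (not part of the module): Cluster is output only and is
# normally returned by the SDK rather than constructed by hand. The helper name
# below is hypothetical, shown only to illustrate reading the fields.
#
#   cluster = get_ray_cluster_somehow()  # hypothetical helper, not defined here
#   print(cluster.cluster_resource_name)
#   # -> projects/<project_num>/locations/<region>/persistentResources/<pr_id>
#   print(cluster.dashboard_address)  # usable with Ray's JobSubmissionClient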


def _check_machine_spec_identical(
    node_type_1: Resources,
    node_type_2: Resources,
) -> int:
    """Check if node_type_1 and node_type_2 have the same machine spec.

    If the machine specs (machine_type, accelerator_type, accelerator_count)
    are identical, returns node_type_2.node_count as the additional replica
    count; otherwise returns 0. Raises ValueError if the machine specs match
    but the boot disk type or size differs.
    """
    additional_replica_count = 0

    # Check if the machine specs are the same.
    if (
        node_type_1.machine_type == node_type_2.machine_type
        and node_type_1.accelerator_type == node_type_2.accelerator_type
        and node_type_1.accelerator_count == node_type_2.accelerator_count
    ):
        if node_type_1.boot_disk_type != node_type_2.boot_disk_type:
            raise ValueError(
                "Worker disk type must match the head node's disk type if"
                " sharing the same machine_type, accelerator_type, and"
                " accelerator_count"
            )
        if node_type_1.boot_disk_size_gb != node_type_2.boot_disk_size_gb:
            raise ValueError(
                "Worker disk size must match the head node's disk size if"
                " sharing the same machine_type, accelerator_type, and"
                " accelerator_count"
            )
        additional_replica_count = node_type_2.node_count
        return additional_replica_count

    return additional_replica_count
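

# A minimal behavior sketch (not part of the module); the Resources values are
# assumptions for illustration:
#
#   head = Resources(machine_type="n1-standard-16")
#   workers = Resources(machine_type="n1-standard-16", node_count=3)
#   _check_machine_spec_identical(head, workers)  # returns 3
#
#   other = Resources(machine_type="n1-standard-8")
#   _check_machine_spec_identical(head, other)  # returns 0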