structure saas with tools

This commit is contained in:
Davidson Gomes
2025-04-25 15:30:54 -03:00
commit 1aef473937
16434 changed files with 6584257 additions and 0 deletions

File diff suppressed because it is too large.


@@ -0,0 +1,282 @@
# -*- coding: utf-8 -*-
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Dict, List, Union
from google.cloud.aiplatform import base
from google.cloud.aiplatform.compat.types import (
featurestore_online_service_v1beta1 as gca_featurestore_online_service_v1beta1,
)
from google.cloud.aiplatform.compat.types import (
types_v1beta1 as gca_types_v1beta1,
)
from google.cloud.aiplatform.featurestore import _entity_type
_LOGGER = base.Logger(__name__)
class EntityType(_entity_type._EntityType):
"""Preview EntityType resource for Vertex AI."""
# TODO(b/262275273): Remove preview v1beta1 implementation of `write_feature_values`
# when GA implementation can write multiple payloads per request. Currently, GA
# supports one payload per request.
def write_feature_values(
self,
instances: Union[
List[gca_featurestore_online_service_v1beta1.WriteFeatureValuesPayload],
Dict[
str,
Dict[
str,
Union[
int,
str,
float,
bool,
bytes,
List[int],
List[str],
List[float],
List[bool],
],
],
],
"pd.DataFrame", # type: ignore # noqa: F821 - skip check for undefined name 'pd'
],
) -> "EntityType":
"""Streaming ingestion. Write feature values directly to Feature Store.
```
my_entity_type = aiplatform.EntityType(
entity_type_name="my_entity_type_id",
featurestore_id="my_featurestore_id",
)
# writing feature values from a pandas DataFrame
my_dataframe = pd.DataFrame(
data = [
{"entity_id": "movie_01", "average_rating": 4.9},
{"entity_id": "movie_02", "average_rating": 4.5},
],
columns=["entity_id", "average_rating"],
)
my_dataframe = my_dataframe.set_index("entity_id")
my_entity_type.preview.write_feature_values(
instances=my_dataframe
)
# writing feature values from a Python dict
my_data_dict = {
"movie_03" : {"average_rating": 3.7},
"movie_04" : {"average_rating": 2.5},
}
my_entity_type.preview.write_feature_values(
instances=my_data_dict
)
# writing feature values from a list of WriteFeatureValuesPayload objects
payloads = [
gca_featurestore_online_service_v1beta1.WriteFeatureValuesPayload(
entity_id="movie_05",
feature_values={
"average_rating": gca_featurestore_online_service_v1beta1.FeatureValue(
double_value=4.9
)
},
)
]
my_entity_type.preview.write_feature_values(
instances=payloads
)
# reading back written feature values
my_entity_type.read(
entity_ids=["movie_01", "movie_02", "movie_03", "movie_04", "movie_05"]
)
```
Args:
instances (
Union[
List[gca_featurestore_online_service_v1beta1.WriteFeatureValuesPayload],
Dict[str, Dict[str, Union[int, str, float, bool, bytes,
List[int], List[str], List[float], List[bool]]]],
pd.DataFrame]):
Required. Feature values to be written to the Feature Store that
can take the form of a list of WriteFeatureValuesPayload objects,
a Python dict of the form {entity_id : {feature_id : feature_value}, ...},
or a pandas DataFrame, where the index holds the unique entity
ID strings and each remaining column represents a feature. Each row
in the DataFrame represents an entity, which has an entity ID
and its associated feature values.
Returns:
EntityType - The updated EntityType object.
"""
if isinstance(instances, Dict):
payloads = self._generate_payloads(instances=instances)
elif isinstance(instances, List):
payloads = instances
else:
instances_dict = instances.to_dict(orient="index")
payloads = self._generate_payloads(instances=instances_dict)
_LOGGER.log_action_start_against_resource(
"Writing",
"feature values",
self,
)
self._featurestore_online_client.select_version("v1beta1").write_feature_values(
entity_type=self.resource_name, payloads=payloads
)
_LOGGER.log_action_completed_against_resource("feature values", "written", self)
return self
@classmethod
def _generate_payloads(
cls,
instances: Dict[
str,
Dict[
str,
Union[
int,
str,
float,
bool,
bytes,
List[int],
List[str],
List[float],
List[bool],
],
],
],
) -> List[gca_featurestore_online_service_v1beta1.WriteFeatureValuesPayload]:
"""Helper method used to generate GAPIC WriteFeatureValuesPayloads from
a Python dict.
Args:
instances (Dict[str, Dict[str, Union[int, str, float, bool, bytes,
List[int], List[str], List[float], List[bool]]]]):
Required. Dict mapping entity IDs to their corresponding features.
Returns:
List[gca_featurestore_online_service_v1beta1.WriteFeatureValuesPayload] -
A list of WriteFeatureValuesPayload objects ready to be written to the Feature Store.
"""
payloads = []
for entity_id, features in instances.items():
feature_values = {}
for feature_id, value in features.items():
feature_value = cls._convert_value_to_gapic_feature_value(
feature_id=feature_id, value=value
)
feature_values[feature_id] = feature_value
payload = gca_featurestore_online_service_v1beta1.WriteFeatureValuesPayload(
entity_id=entity_id, feature_values=feature_values
)
payloads.append(payload)
return payloads
@classmethod
def _convert_value_to_gapic_feature_value(
cls,
feature_id: str,
value: Union[
int, str, float, bool, bytes, List[int], List[str], List[float], List[bool]
],
) -> gca_featurestore_online_service_v1beta1.FeatureValue:
"""Helper method that converts a Python literal value or a list of
literals to a GAPIC FeatureValue.
Args:
feature_id (str):
Required. Name of a feature.
value (Union[int, str, float, bool, bytes,
List[int], List[str], List[float], List[bool]]]):
Required. Python literal value or list of Python literals to
be converted to a GAPIC FeatureValue.
Returns:
gca_featurestore_online_service_v1beta1.FeatureValue - GAPIC object
that represents the value of a feature.
Raises:
ValueError if a list has values that are not all of the same type.
ValueError if feature type is not supported.
"""
if isinstance(value, bool):
feature_value = gca_featurestore_online_service_v1beta1.FeatureValue(
bool_value=value
)
elif isinstance(value, str):
feature_value = gca_featurestore_online_service_v1beta1.FeatureValue(
string_value=value
)
elif isinstance(value, int):
feature_value = gca_featurestore_online_service_v1beta1.FeatureValue(
int64_value=value
)
elif isinstance(value, float):
feature_value = gca_featurestore_online_service_v1beta1.FeatureValue(
double_value=value
)
elif isinstance(value, bytes):
feature_value = gca_featurestore_online_service_v1beta1.FeatureValue(
bytes_value=value
)
elif isinstance(value, List):
if all([isinstance(item, bool) for item in value]):
feature_value = gca_featurestore_online_service_v1beta1.FeatureValue(
bool_array_value=gca_types_v1beta1.BoolArray(values=value)
)
elif all([isinstance(item, str) for item in value]):
feature_value = gca_featurestore_online_service_v1beta1.FeatureValue(
string_array_value=gca_types_v1beta1.StringArray(values=value)
)
elif all([isinstance(item, int) for item in value]):
feature_value = gca_featurestore_online_service_v1beta1.FeatureValue(
int64_array_value=gca_types_v1beta1.Int64Array(values=value)
)
elif all([isinstance(item, float) for item in value]):
feature_value = gca_featurestore_online_service_v1beta1.FeatureValue(
double_array_value=gca_types_v1beta1.DoubleArray(values=value)
)
else:
raise ValueError(
f"Cannot infer feature value for feature {feature_id} with "
f"value {value}! Please ensure every value in the list "
f"is the same type (either int, str, float, bool)."
)
else:
raise ValueError(
f"Cannot infer feature value for feature {feature_id} with "
f"value {value}! {type(value)} type is not supported. "
f"Please ensure value type is an int, str, float, bool, "
f"bytes, or a list of int, str, float, bool."
)
return feature_value


@@ -0,0 +1,869 @@
# -*- coding: utf-8 -*-
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Dict, List, Optional, Union
import copy
import uuid
from google.api_core import retry
from google.auth import credentials as auth_credentials
from google.cloud import aiplatform
from google.cloud.aiplatform import base
from google.cloud.aiplatform import compat
from google.cloud.aiplatform import initializer
from google.cloud.aiplatform import jobs
from google.cloud.aiplatform import utils
from google.cloud.aiplatform.compat.types import (
custom_job_v1beta1 as gca_custom_job_compat,
hyperparameter_tuning_job_v1beta1 as gca_hyperparameter_tuning_job_compat,
job_state as gca_job_state,
job_state_v1beta1 as gca_job_state_v1beta1,
study_v1beta1,
)
from google.cloud.aiplatform.compat.types import (
execution_v1beta1 as gcs_execution_compat,
)
from google.cloud.aiplatform.compat.types import io_v1beta1 as gca_io_compat
from google.cloud.aiplatform.metadata import constants as metadata_constants
from google.cloud.aiplatform import hyperparameter_tuning
from google.cloud.aiplatform.utils import console_utils
import proto
from google.protobuf import duration_pb2 # type: ignore
_LOGGER = base.Logger(__name__)
_DEFAULT_RETRY = retry.Retry()
# TODO(b/242108750): remove temporary logic once model monitoring for batch prediction is GA
_JOB_COMPLETE_STATES = (
gca_job_state.JobState.JOB_STATE_SUCCEEDED,
gca_job_state.JobState.JOB_STATE_FAILED,
gca_job_state.JobState.JOB_STATE_CANCELLED,
gca_job_state.JobState.JOB_STATE_PAUSED,
gca_job_state_v1beta1.JobState.JOB_STATE_SUCCEEDED,
gca_job_state_v1beta1.JobState.JOB_STATE_FAILED,
gca_job_state_v1beta1.JobState.JOB_STATE_CANCELLED,
gca_job_state_v1beta1.JobState.JOB_STATE_PAUSED,
)
_JOB_ERROR_STATES = (
gca_job_state.JobState.JOB_STATE_FAILED,
gca_job_state.JobState.JOB_STATE_CANCELLED,
gca_job_state_v1beta1.JobState.JOB_STATE_FAILED,
gca_job_state_v1beta1.JobState.JOB_STATE_CANCELLED,
)
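# The state tuples above union the v1 and v1beta1 enums so that state checks
# work regardless of which API version produced the job resource.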
class CustomJob(jobs.CustomJob):
"""Deprecated. Vertex AI Custom Job (preview)."""
def __init__(
self,
# TODO(b/223262536): Make display_name parameter fully optional in next major release
display_name: str,
worker_pool_specs: Union[
List[Dict], List[gca_custom_job_compat.WorkerPoolSpec]
],
base_output_dir: Optional[str] = None,
project: Optional[str] = None,
location: Optional[str] = None,
credentials: Optional[auth_credentials.Credentials] = None,
labels: Optional[Dict[str, str]] = None,
encryption_spec_key_name: Optional[str] = None,
staging_bucket: Optional[str] = None,
persistent_resource_id: Optional[str] = None,
):
"""Deprecated. Please use the GA (non-preview) version of this class.
Constructs a Custom Job with Worker Pool Specs.
```
Example usage:
worker_pool_specs = [
{
"machine_spec": {
"machine_type": "n1-standard-4",
"accelerator_type": "NVIDIA_TESLA_K80",
"accelerator_count": 1,
},
"replica_count": 1,
"container_spec": {
"image_uri": container_image_uri,
"command": [],
"args": [],
},
}
]
my_job = aiplatform.preview.jobs.CustomJob(
display_name='my_job',
worker_pool_specs=worker_pool_specs,
labels={'my_key': 'my_value'},
)
my_job.run()
```
For more information on configuring worker pool specs please visit:
https://cloud.google.com/ai-platform-unified/docs/training/create-custom-job
Args:
display_name (str):
Required. The user-defined name of the CustomJob.
The name can be up to 128 characters long and can consist
of any UTF-8 characters.
worker_pool_specs (Union[List[Dict], List[aiplatform.gapic.WorkerPoolSpec]]):
Required. The spec of the worker pools including machine type and Docker image.
Can be provided as a list of dictionaries or a list of WorkerPoolSpec proto messages.
base_output_dir (str):
Optional. GCS output directory of job. If not provided a
timestamped directory in the staging directory will be used.
project (str):
Optional. Project to run the custom job in. Overrides project set in aiplatform.init.
location (str):
Optional. Location to run the custom job in. Overrides location set in aiplatform.init.
credentials (auth_credentials.Credentials):
Optional. Custom credentials to use to call the custom job service. Overrides
credentials set in aiplatform.init.
labels (Dict[str, str]):
Optional. The labels with user-defined metadata to
organize CustomJobs.
Label keys and values can be no longer than 64
characters (Unicode codepoints), can only
contain lowercase letters, numeric characters,
underscores and dashes. International characters
are allowed.
See https://goo.gl/xmQnxf for more information
and examples of labels.
encryption_spec_key_name (str):
Optional. Customer-managed encryption key name for a
CustomJob. If this is set, then all resources
created by the CustomJob will be encrypted with
the provided encryption key.
staging_bucket (str):
Optional. Bucket for produced custom job artifacts. Overrides
staging_bucket set in aiplatform.init.
persistent_resource_id (str):
Optional. The ID of the PersistentResource in the same Project
and Location. If this is specified, the job will be run on
existing machines held by the PersistentResource instead of
on-demand short-lived machines. The network and CMEK configs on
the job should be consistent with those on the PersistentResource,
otherwise, the job will be rejected.
Raises:
RuntimeError: If staging bucket was not set using aiplatform.init
and a staging bucket was not passed in.
"""
super().__init__(
display_name=display_name,
worker_pool_specs=worker_pool_specs,
base_output_dir=base_output_dir,
project=project,
location=location,
credentials=credentials,
labels=labels,
encryption_spec_key_name=encryption_spec_key_name,
staging_bucket=staging_bucket,
)
staging_bucket = staging_bucket or initializer.global_config.staging_bucket
if not staging_bucket:
raise RuntimeError(
"staging_bucket should be passed to CustomJob constructor or "
"should be set using aiplatform.init(staging_bucket='gs://my-bucket')"
)
if labels:
utils.validate_labels(labels)
# default directory if not given
base_output_dir = base_output_dir or utils._timestamped_gcs_dir(
staging_bucket, "aiplatform-custom-job"
)
if not display_name:
display_name = self.__class__._generate_display_name()
self._gca_resource = gca_custom_job_compat.CustomJob(
display_name=display_name,
job_spec=gca_custom_job_compat.CustomJobSpec(
worker_pool_specs=worker_pool_specs,
base_output_directory=gca_io_compat.GcsDestination(
output_uri_prefix=base_output_dir
),
persistent_resource_id=persistent_resource_id,
),
labels=labels,
encryption_spec=initializer.global_config.get_encryption_spec(
encryption_spec_key_name=encryption_spec_key_name,
select_version=compat.V1BETA1,
),
)
self._experiment = None
self._experiment_run = None
self._enable_autolog = False
def _get_gca_resource(
self,
resource_name: str,
parent_resource_name_fields: Optional[Dict[str, str]] = None,
) -> proto.Message:
"""Returns GAPIC service representation of client class resource.
Args:
resource_name (str): Required. A fully-qualified resource name or ID.
parent_resource_name_fields (Dict[str,str]):
Optional. Mapping of parent resource name key to values. These
will be used to compose the resource name if only resource ID is given.
Should not include project and location.
"""
resource_name = utils.full_resource_name(
resource_name=resource_name,
resource_noun=self._resource_noun,
parse_resource_name_method=self._parse_resource_name,
format_resource_name_method=self._format_resource_name,
project=self.project,
location=self.location,
parent_resource_name_fields=parent_resource_name_fields,
resource_id_validator=self._resource_id_validator,
)
return getattr(self.api_client.select_version("v1beta1"), self._getter_method)(
name=resource_name, retry=_DEFAULT_RETRY
)
def submit(
self,
*,
service_account: Optional[str] = None,
network: Optional[str] = None,
timeout: Optional[int] = None,
restart_job_on_worker_restart: bool = False,
enable_web_access: bool = False,
experiment: Optional[Union["aiplatform.Experiment", str]] = None,
experiment_run: Optional[Union["aiplatform.ExperimentRun", str]] = None,
tensorboard: Optional[str] = None,
create_request_timeout: Optional[float] = None,
disable_retries: bool = False,
max_wait_duration: Optional[int] = None,
) -> None:
"""Submit the configured CustomJob.
Args:
service_account (str):
Optional. Specifies the service account for workload run-as account.
Users submitting jobs must have act-as permission on this run-as account.
network (str):
Optional. The full name of the Compute Engine network to which the job
should be peered. For example, projects/12345/global/networks/myVPC.
Private services access must already be configured for the network.
timeout (int):
The maximum job running time in seconds. The default is 7 days.
restart_job_on_worker_restart (bool):
Restarts the entire CustomJob if a worker
gets restarted. This feature can be used by
distributed training jobs that are not resilient
to workers leaving and joining a job.
enable_web_access (bool):
Whether you want Vertex AI to enable interactive shell access
to training containers.
https://cloud.google.com/vertex-ai/docs/training/monitor-debug-interactive-shell
experiment (Union[aiplatform.Experiment, str]):
Optional. The instance or name of an Experiment resource to which
this CustomJob will upload training parameters and metrics.
`service_account` is required with provided `experiment`.
For more information on configuring your service account please visit:
https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-training
experiment_run (Union[aiplatform.ExperimentRun, str]):
Optional. The instance or name of an ExperimentRun resource to which
this CustomJob will upload training parameters and metrics.
This arg can only be set when `experiment` is set. If 'experiment'
is set but 'experiment_run` is not, an ExperimentRun resource
will still be auto-generated.
tensorboard (str):
Optional. The name of a Vertex AI
[Tensorboard][google.cloud.aiplatform.v1beta1.Tensorboard]
resource to which this CustomJob will upload Tensorboard
logs. Format:
``projects/{project}/locations/{location}/tensorboards/{tensorboard}``
The training script should write TensorBoard logs to the following
Vertex AI environment variable:
AIP_TENSORBOARD_LOG_DIR
`service_account` is required with provided `tensorboard`.
For more information on configuring your service account please visit:
https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-training
create_request_timeout (float):
Optional. The timeout for the create request in seconds.
disable_retries (bool):
Indicates if the job should retry for internal errors after the
job starts running. If True, overrides
`restart_job_on_worker_restart` to False.
max_wait_duration (int):
This is the maximum duration that a job will wait for the
requested resources to be provisioned in seconds. If set to 0,
the job will wait indefinitely. The default is 30 minutes.
Raises:
ValueError:
If both `experiment` and `tensorboard` are specified or if
`enable_autolog` is True in `CustomJob.from_local_script` but
`experiment` is not specified or the specified experiment
doesn't have a backing tensorboard.
"""
if experiment and tensorboard:
raise ValueError("'experiment' and 'tensorboard' cannot be set together.")
if self._enable_autolog and (not experiment):
raise ValueError(
"'experiment' is required since you've enabled autolog in 'from_local_script'."
)
if service_account:
self._gca_resource.job_spec.service_account = service_account
if network:
self._gca_resource.job_spec.network = network
if (
timeout
or restart_job_on_worker_restart
or disable_retries
or max_wait_duration
):
timeout = duration_pb2.Duration(seconds=timeout) if timeout else None
max_wait_duration = (
duration_pb2.Duration(seconds=max_wait_duration)
if max_wait_duration
else None
)
self._gca_resource.job_spec.scheduling = gca_custom_job_compat.Scheduling(
timeout=timeout,
restart_job_on_worker_restart=restart_job_on_worker_restart,
disable_retries=disable_retries,
max_wait_duration=max_wait_duration,
)
if enable_web_access:
self._gca_resource.job_spec.enable_web_access = enable_web_access
if tensorboard:
self._gca_resource.job_spec.tensorboard = tensorboard
# TODO(b/275105711) Update implementation after experiment/run in the proto
if experiment:
# short-term solution to set experiment/experimentRun in SDK
if isinstance(experiment, aiplatform.Experiment):
self._experiment = experiment
# convert the Experiment instance to string to be passed to env
experiment = experiment.name
else:
self._experiment = aiplatform.Experiment.get(experiment_name=experiment)
if not self._experiment:
raise ValueError(
f"Experiment '{experiment}' doesn't exist. "
"Please call aiplatform.init(experiment='my-exp') to create an experiment."
)
elif (
not self._experiment.backing_tensorboard_resource_name
and self._enable_autolog
):
raise ValueError(
f"Experiment '{experiment}' doesn't have a backing tensorboard resource, "
"which is required by the experiment autologging feature. "
"Please call Experiment.assign_backing_tensorboard('my-tb-resource-name')."
)
# if run name is not specified, auto-generate one
if not experiment_run:
experiment_run = (
# TODO(b/223262536)Once display_name is optional this run name
# might be invalid as well.
f"{self._gca_resource.display_name}-{uuid.uuid4().hex[0:5]}"
)
# get or create the experiment run for the job
if isinstance(experiment_run, aiplatform.ExperimentRun):
self._experiment_run = experiment_run
# convert the ExperimentRun instance to string to be passed to env
experiment_run = experiment_run.name
else:
self._experiment_run = aiplatform.ExperimentRun.get(
run_name=experiment_run,
experiment=self._experiment,
)
if not self._experiment_run:
self._experiment_run = aiplatform.ExperimentRun.create(
run_name=experiment_run,
experiment=self._experiment,
)
self._experiment_run.update_state(
gcs_execution_compat.Execution.State.RUNNING
)
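# Propagate the experiment and run names into each container's environment
# so code running inside the job can associate its metrics with this run.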
worker_pool_specs = self._gca_resource.job_spec.worker_pool_specs
for spec in worker_pool_specs:
if not spec:
continue
if "python_package_spec" in spec:
container_spec = spec.python_package_spec
else:
container_spec = spec.container_spec
experiment_env = [
{
"name": metadata_constants.ENV_EXPERIMENT_KEY,
"value": experiment,
},
{
"name": metadata_constants.ENV_EXPERIMENT_RUN_KEY,
"value": experiment_run,
},
]
if "env" in container_spec:
container_spec.env.extend(experiment_env)
else:
container_spec.env = experiment_env
_LOGGER.log_create_with_lro(self.__class__)
self._gca_resource = self.api_client.select_version(
"v1beta1"
).create_custom_job(
parent=self._parent,
custom_job=self._gca_resource,
timeout=create_request_timeout,
)
_LOGGER.log_create_complete_with_getter(
self.__class__, self._gca_resource, "custom_job"
)
_LOGGER.info("View Custom Job:\n%s" % self._dashboard_uri())
if tensorboard:
_LOGGER.info(
"View Tensorboard:\n%s"
% console_utils.custom_job_tensorboard_console_uri(
tensorboard, self.resource_name
)
)
if experiment:
custom_job = {
metadata_constants._CUSTOM_JOB_RESOURCE_NAME: self.resource_name,
metadata_constants._CUSTOM_JOB_CONSOLE_URI: self._dashboard_uri(),
}
run_context = self._experiment_run._metadata_node
custom_jobs = run_context._gca_resource.metadata.get(
metadata_constants._CUSTOM_JOB_KEY
)
if custom_jobs:
custom_jobs.append(custom_job)
else:
custom_jobs = [custom_job]
run_context.update({metadata_constants._CUSTOM_JOB_KEY: custom_jobs})
class HyperparameterTuningJob(jobs.HyperparameterTuningJob):
"""Deprecated. Vertex AI Hyperparameter Tuning Job (preview)."""
def __init__(
self,
# TODO(b/223262536): Make display_name parameter fully optional in next major release
display_name: str,
custom_job: CustomJob,
metric_spec: Dict[str, str],
parameter_spec: Dict[str, hyperparameter_tuning._ParameterSpec],
max_trial_count: int,
parallel_trial_count: int,
max_failed_trial_count: int = 0,
search_algorithm: Optional[str] = None,
measurement_selection: Optional[str] = "best",
project: Optional[str] = None,
location: Optional[str] = None,
credentials: Optional[auth_credentials.Credentials] = None,
labels: Optional[Dict[str, str]] = None,
encryption_spec_key_name: Optional[str] = None,
):
"""Deprecated. Please use the GA (non-preview) version of this class.
Configures a HyperparameterTuning Job.
Example usage:
```
from google.cloud.aiplatform import hyperparameter_tuning as hpt
worker_pool_specs = [
{
"machine_spec": {
"machine_type": "n1-standard-4",
"accelerator_type": "NVIDIA_TESLA_K80",
"accelerator_count": 1,
},
"replica_count": 1,
"container_spec": {
"image_uri": container_image_uri,
"command": [],
"args": [],
},
}
]
custom_job = aiplatform.preview.jobs.CustomJob(
display_name='my_job',
worker_pool_specs=worker_pool_specs,
labels={'my_key': 'my_value'},
persistent_resource_id='my-persistent-resource',
)
hp_job = aiplatform.preview.jobs.HyperparameterTuningJob(
display_name='hp-test',
custom_job=custom_job,
metric_spec={
'loss': 'minimize',
},
parameter_spec={
'lr': hpt.DoubleParameterSpec(min=0.001, max=0.1, scale='log'),
'units': hpt.IntegerParameterSpec(min=4, max=128, scale='linear'),
'activation': hpt.CategoricalParameterSpec(values=['relu', 'selu']),
'batch_size': hpt.DiscreteParameterSpec(values=[128, 256], scale='linear')
},
max_trial_count=128,
parallel_trial_count=8,
labels={'my_key': 'my_value'},
)
hp_job.run()
print(hp_job.trials)
```
For more information on using hyperparameter tuning please visit:
https://cloud.google.com/ai-platform-unified/docs/training/using-hyperparameter-tuning
Args:
display_name (str):
Required. The user-defined name of the HyperparameterTuningJob.
The name can be up to 128 characters long and can consist
of any UTF-8 characters.
custom_job (aiplatform.preview.jobs.CustomJob):
Required. Configured CustomJob. The worker pool spec from this custom job
applies to the CustomJobs created in all the trials. A persistent_resource_id can be
specified on the custom job to be used when running this Hyperparameter Tuning job.
metric_spec: Dict[str, str]
Required. Dictionary representing metrics to optimize. The dictionary key is the metric_id,
which is reported by your training job, and the dictionary value is the
optimization goal of the metric('minimize' or 'maximize'). example:
metric_spec = {'loss': 'minimize', 'accuracy': 'maximize'}
parameter_spec (Dict[str, hyperparameter_tuning._ParameterSpec]):
Required. Dictionary representing parameters to optimize. The dictionary key is the parameter_id,
which is passed into your training job as a command-line keyword argument, and the
dictionary value is the parameter specification for that parameter.
from google.cloud.aiplatform import hyperparameter_tuning as hpt
parameter_spec={
'decay': hpt.DoubleParameterSpec(min=1e-7, max=1, scale='linear'),
'learning_rate': hpt.DoubleParameterSpec(min=1e-7, max=1, scale='linear'),
'batch_size': hpt.DiscreteParameterSpec(values=[4, 8, 16, 32, 64, 128], scale='linear')
}
Supported parameter specifications can be found under aiplatform.hyperparameter_tuning.
These parameter specifications are currently supported:
DoubleParameterSpec, IntegerParameterSpec, CategoricalParameterSpec, DiscreteParameterSpec
max_trial_count (int):
Required. The desired total number of Trials.
parallel_trial_count (int):
Required. The desired number of Trials to run in parallel.
max_failed_trial_count (int):
Optional. The number of failed Trials that need to be
seen before failing the HyperparameterTuningJob.
If set to 0, Vertex AI decides how many Trials
must fail before the whole job fails.
search_algorithm (str):
The search algorithm specified for the Study.
Accepts one of the following:
`None` - If you do not specify an algorithm, your job uses
the default Vertex AI algorithm. The default algorithm
applies Bayesian optimization to arrive at the optimal
solution with a more effective search over the parameter space.
'grid' - A simple grid search within the feasible space. This
option is particularly useful if you want to specify a quantity
of trials that is greater than the number of points in the
feasible space. In such cases, if you do not specify a grid
search, the Vertex AI default algorithm may generate duplicate
suggestions. To use grid search, all parameter specs must be
of type `IntegerParameterSpec`, `CategoricalParameterSpec`,
or `DiscreteParameterSpec`.
'random' - A simple random search within the feasible space.
measurement_selection (str):
This indicates which measurement to use if/when the service
automatically selects the final measurement from previously reported
intermediate measurements.
Accepts: 'best', 'last'
Choose this based on two considerations:
A) Do you expect your measurements to monotonically improve? If so,
choose 'last'. On the other hand, if you're in a situation
where your system can "over-train" and you expect the performance to
get better for a while but then start declining, choose
'best'. B) Are your measurements significantly noisy
and/or irreproducible? If so, 'best' will tend to be
over-optimistic, and it may be better to choose 'last'. If
both or neither of (A) and (B) apply, it doesn't matter which
selection type is chosen.
project (str):
Optional. Project to run the HyperparameterTuningJob in. Overrides project set in aiplatform.init.
location (str):
Optional. Location to run the HyperparameterTuningJob in. Overrides location set in aiplatform.init.
credentials (auth_credentials.Credentials):
Optional. Custom credentials to use to call the HyperparameterTuningJob service. Overrides
credentials set in aiplatform.init.
labels (Dict[str, str]):
Optional. The labels with user-defined metadata to
organize HyperparameterTuningJobs.
Label keys and values can be no longer than 64
characters (Unicode codepoints), can only
contain lowercase letters, numeric characters,
underscores and dashes. International characters
are allowed.
See https://goo.gl/xmQnxf for more information
and examples of labels.
encryption_spec_key_name (str):
Optional. Customer-managed encryption key options for a
HyperparameterTuningJob. If this is set, then
all resources created by the
HyperparameterTuningJob will be encrypted with
the provided encryption key.
"""
super(jobs.HyperparameterTuningJob, self).__init__(
project=project, location=location, credentials=credentials
)
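# Convert the user-facing metric_spec dict, e.g. {'loss': 'minimize'},
# into StudySpec.MetricSpec protos with upper-cased goal enums.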
metrics = [
study_v1beta1.StudySpec.MetricSpec(metric_id=metric_id, goal=goal.upper())
for metric_id, goal in metric_spec.items()
]
parameters = [
parameter._to_parameter_spec_v1beta1(parameter_id=parameter_id)
for parameter_id, parameter in parameter_spec.items()
]
study_spec = study_v1beta1.StudySpec(
metrics=metrics,
parameters=parameters,
algorithm=hyperparameter_tuning.SEARCH_ALGORITHM_TO_PROTO_VALUE[
search_algorithm
],
measurement_selection_type=hyperparameter_tuning.MEASUREMENT_SELECTION_TO_PROTO_VALUE[
measurement_selection
],
)
if not display_name:
display_name = self.__class__._generate_display_name()
self._gca_resource = (
gca_hyperparameter_tuning_job_compat.HyperparameterTuningJob(
display_name=display_name,
study_spec=study_spec,
max_trial_count=max_trial_count,
parallel_trial_count=parallel_trial_count,
max_failed_trial_count=max_failed_trial_count,
trial_job_spec=copy.deepcopy(custom_job.job_spec),
labels=labels,
encryption_spec=initializer.global_config.get_encryption_spec(
encryption_spec_key_name=encryption_spec_key_name,
select_version=compat.V1BETA1,
),
)
)
def _get_gca_resource(
self,
resource_name: str,
parent_resource_name_fields: Optional[Dict[str, str]] = None,
) -> proto.Message:
"""Returns GAPIC service representation of client class resource.
Args:
resource_name (str): Required. A fully-qualified resource name or ID.
parent_resource_name_fields (Dict[str,str]):
Optional. Mapping of parent resource name key to values. These
will be used to compose the resource name if only resource ID is given.
Should not include project and location.
"""
resource_name = utils.full_resource_name(
resource_name=resource_name,
resource_noun=self._resource_noun,
parse_resource_name_method=self._parse_resource_name,
format_resource_name_method=self._format_resource_name,
project=self.project,
location=self.location,
parent_resource_name_fields=parent_resource_name_fields,
resource_id_validator=self._resource_id_validator,
)
return getattr(self.api_client.select_version("v1beta1"), self._getter_method)(
name=resource_name, retry=_DEFAULT_RETRY
)
@base.optional_sync()
def _run(
self,
service_account: Optional[str] = None,
network: Optional[str] = None,
timeout: Optional[int] = None, # seconds
restart_job_on_worker_restart: bool = False,
enable_web_access: bool = False,
tensorboard: Optional[str] = None,
sync: bool = True,
create_request_timeout: Optional[float] = None,
disable_retries: bool = False,
max_wait_duration: Optional[int] = None,
) -> None:
"""Helper method to ensure network synchronization and to run the configured CustomJob.
Args:
service_account (str):
Optional. Specifies the service account for workload run-as account.
Users submitting jobs must have act-as permission on this run-as account.
network (str):
Optional. The full name of the Compute Engine network to which the job
should be peered. For example, projects/12345/global/networks/myVPC.
Private services access must already be configured for the network.
timeout (int):
Optional. The maximum job running time in seconds. The default is 7 days.
restart_job_on_worker_restart (bool):
Restarts the entire CustomJob if a worker
gets restarted. This feature can be used by
distributed training jobs that are not resilient
to workers leaving and joining a job.
enable_web_access (bool):
Whether you want Vertex AI to enable interactive shell access
to training containers.
https://cloud.google.com/vertex-ai/docs/training/monitor-debug-interactive-shell
tensorboard (str):
Optional. The name of a Vertex AI
[Tensorboard][google.cloud.aiplatform.v1beta1.Tensorboard]
resource to which this CustomJob will upload Tensorboard
logs. Format:
``projects/{project}/locations/{location}/tensorboards/{tensorboard}``
The training script should write TensorBoard logs to the following
Vertex AI environment variable:
AIP_TENSORBOARD_LOG_DIR
`service_account` is required with provided `tensorboard`.
For more information on configuring your service account please visit:
https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-training
sync (bool):
Whether to execute this method synchronously. If False, this method
will unblock and it will be executed in a concurrent Future.
create_request_timeout (float):
Optional. The timeout for the create request in seconds.
disable_retries (bool):
Indicates if the job should retry for internal errors after the
job starts running. If True, overrides
`restart_job_on_worker_restart` to False.
max_wait_duration (int):
This is the maximum duration that a job will wait for the
requested resources to be provisioned in seconds. If set to 0,
the job will wait indefinitely. The default is 30 minutes.
"""
if service_account:
self._gca_resource.trial_job_spec.service_account = service_account
if network:
self._gca_resource.trial_job_spec.network = network
if (
timeout
or restart_job_on_worker_restart
or disable_retries
or max_wait_duration
):
timeout = duration_pb2.Duration(seconds=timeout) if timeout else None
max_wait_duration = (
duration_pb2.Duration(seconds=max_wait_duration)
if max_wait_duration
else None
)
self._gca_resource.trial_job_spec.scheduling = (
gca_custom_job_compat.Scheduling(
timeout=timeout,
restart_job_on_worker_restart=restart_job_on_worker_restart,
disable_retries=disable_retries,
max_wait_duration=max_wait_duration,
)
)
if enable_web_access:
self._gca_resource.trial_job_spec.enable_web_access = enable_web_access
if tensorboard:
self._gca_resource.trial_job_spec.tensorboard = tensorboard
_LOGGER.log_create_with_lro(self.__class__)
self._gca_resource = self.api_client.select_version(
"v1beta1"
).create_hyperparameter_tuning_job(
parent=self._parent,
hyperparameter_tuning_job=self._gca_resource,
timeout=create_request_timeout,
)
_LOGGER.log_create_complete_with_getter(
self.__class__, self._gca_resource, "hpt_job"
)
_LOGGER.info("View HyperparameterTuningJob:\n%s" % self._dashboard_uri())
if tensorboard:
_LOGGER.info(
"View Tensorboard:\n%s"
% console_utils.custom_job_tensorboard_console_uri(
tensorboard, self.resource_name
)
)
self._block_until_complete()
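
A minimal usage sketch for the preview CustomJob's `submit` with experiment tracking, following its docstring above. The project, bucket, service account, and experiment names are placeholders, and `worker_pool_specs` is assumed to be defined as in the class docstring.

```
from google.cloud import aiplatform

aiplatform.init(
    project="my-project",
    location="us-central1",
    staging_bucket="gs://my-bucket",
)

job = aiplatform.preview.jobs.CustomJob(
    display_name="my-job",
    worker_pool_specs=worker_pool_specs,  # defined as in the class docstring
)
job.submit(
    # `service_account` is required whenever `experiment` is provided.
    service_account="trainer@my-project.iam.gserviceaccount.com",
    experiment="my-experiment",  # must already exist
    experiment_run="my-run",     # optional; auto-generated when omitted
)
```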

File diff suppressed because it is too large.


@@ -0,0 +1,430 @@
# -*- coding: utf-8 -*-
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Dict, List, Optional, Union
from google.api_core import operation
from google.api_core import retry
from google.auth import credentials as auth_credentials
from google.cloud.aiplatform import base
from google.cloud.aiplatform import initializer
from google.cloud.aiplatform import utils
from google.cloud.aiplatform.compat.services import (
persistent_resource_service_client_v1beta1 as persistent_resource_service_client_compat,
)
from google.cloud.aiplatform_v1beta1.types import (
encryption_spec as gca_encryption_spec_compat,
)
from google.cloud.aiplatform_v1beta1.types import (
persistent_resource as gca_persistent_resource_compat,
)
from google.protobuf import timestamp_pb2 # type: ignore
from google.rpc import status_pb2 # type: ignore
_LOGGER = base.Logger(__name__)
_DEFAULT_RETRY = retry.Retry()
class PersistentResource(base.VertexAiResourceNounWithFutureManager):
"""Managed PersistentResource feature for Vertex AI (Preview)."""
client_class = utils.PersistentResourceClientWithOverride
_resource_noun = "persistentResource"
_getter_method = "get_persistent_resource"
_list_method = "list_persistent_resources"
_delete_method = "delete_persistent_resource"
_parse_resource_name_method = "parse_persistent_resource_path"
_format_resource_name_method = "persistent_resource_path"
def __init__(
self,
persistent_resource_id: str,
project: Optional[str] = None,
location: Optional[str] = None,
credentials: Optional[auth_credentials.Credentials] = None,
):
"""Retrieves the PersistentResource and instantiates its representation.
Args:
persistent_resource_id (str):
Required. The ID of the PersistentResource.
project (str):
Project this PersistentResource is in. Overrides
project set in aiplatform.init.
location (str):
Location this PersistentResource is in. Overrides
location set in aiplatform.init.
credentials (auth_credentials.Credentials):
Custom credentials to use to manage this PersistentResource.
Overrides credentials set in aiplatform.init.
"""
super().__init__(
project=project,
location=location,
credentials=credentials,
resource_name=persistent_resource_id,
)
self._gca_resource = self._get_gca_resource(
resource_name=persistent_resource_id
)
@property
def display_name(self) -> Optional[str]:
"""The display name of the PersistentResource."""
self._assert_gca_resource_is_available()
return getattr(self._gca_resource, "display_name", None)
@property
def state(self) -> gca_persistent_resource_compat.PersistentResource.State:
"""The state of the PersistentResource.
Values:
STATE_UNSPECIFIED (0):
Not set.
PROVISIONING (1):
The PROVISIONING state indicates the
persistent resource is being created.
RUNNING (3):
The RUNNING state indicates the persistent
resource is healthy and fully usable.
STOPPING (4):
The STOPPING state indicates the persistent
resource is being deleted.
ERROR (5):
The ERROR state indicates the persistent resource may be
unusable. Details can be found in the ``error`` field.
"""
self._assert_gca_resource_is_available()
return getattr(self._gca_resource, "state", None)
@property
def error(self) -> Optional[status_pb2.Status]:
"""The error status of the PersistentResource.
Only populated when the resource's state is ``STOPPING`` or ``ERROR``.
"""
self._assert_gca_resource_is_available()
return getattr(self._gca_resource, "error", None)
@property
def create_time(self) -> Optional[timestamp_pb2.Timestamp]:
"""Time when the PersistentResource was created."""
self._assert_gca_resource_is_available()
return getattr(self._gca_resource, "create_time", None)
@property
def start_time(self) -> Optional[timestamp_pb2.Timestamp]:
"""Time when the PersistentResource first entered the ``RUNNING`` state."""
self._assert_gca_resource_is_available()
return getattr(self._gca_resource, "start_time", None)
@property
def update_time(self) -> Optional[timestamp_pb2.Timestamp]:
"""Time when the PersistentResource was most recently updated."""
self._assert_gca_resource_is_available()
return getattr(self._gca_resource, "update_time", None)
@property
def network(self) -> Optional[str]:
"""The network peered with the PersistentResource.
The full name of the Compute Engine
`network </compute/docs/networks-and-firewalls#networks>`__ to be peered
with Vertex AI to host the persistent resources.
For example, ``projects/12345/global/networks/myVPC``.
`Format </compute/docs/reference/rest/v1/networks/insert>`__ is of the
form ``projects/{project}/global/networks/{network}``. Where {project}
is a project number, as in ``12345``, and {network} is a network name.
To specify this field, you must have already `configured VPC Network
Peering for Vertex
AI <https://cloud.google.com/vertex-ai/docs/general/vpc-peering>`__.
If this field is left unspecified, the resources aren't peered with any
network.
"""
self._assert_gca_resource_is_available()
return getattr(self._gca_resource, "network", None)
@classmethod
@base.optional_sync()
def create(
cls,
persistent_resource_id: str,
resource_pools: Union[
List[Dict], List[gca_persistent_resource_compat.ResourcePool]
],
display_name: Optional[str] = None,
labels: Optional[Dict[str, str]] = None,
network: Optional[str] = None,
kms_key_name: Optional[str] = None,
service_account: Optional[str] = None,
reserved_ip_ranges: Optional[List[str]] = None,
sync: Optional[bool] = True, # pylint: disable=unused-argument
project: Optional[str] = None,
location: Optional[str] = None,
credentials: Optional[auth_credentials.Credentials] = None,
) -> "PersistentResource":
r"""Creates a PersistentResource.
Args:
persistent_resource_id (str):
Required. The ID to use for the PersistentResource,
which becomes the final component of the
PersistentResource's resource name.
The maximum length is 63 characters, and valid
characters are ``/^[a-z]([a-z0-9-]{0,61}[a-z0-9])?$/``.
This corresponds to the ``persistent_resource_id`` field
on the ``request`` instance; if ``request`` is provided, this
should not be set.
resource_pools (Union[List[Dict], List[gca_persistent_resource_compat.ResourcePool]]):
Required. The list of resource pools to create for the
PersistentResource.
display_name (str):
Optional. The display name of the
PersistentResource. The name can be up to 128
characters long and can consist of any UTF-8
characters.
labels (MutableMapping[str, str]):
Optional. The labels with user-defined
metadata to organize PersistentResource.
Label keys and values can be no longer than 64
characters (Unicode codepoints), can only
contain lowercase letters, numeric characters,
underscores and dashes. International characters
are allowed.
See https://goo.gl/xmQnxf for more information
and examples of labels.
network (str):
Optional. The full name of the Compute Engine
`network </compute/docs/networks-and-firewalls#networks>`__
to be peered with Vertex AI to host the persistent resources.
For example, ``projects/12345/global/networks/myVPC``.
`Format </compute/docs/reference/rest/v1/networks/insert>`__
is of the form
``projects/{project}/global/networks/{network}``. Where
{project} is a project number, as in ``12345``, and
{network} is a network name.
To specify this field, you must have already `configured VPC
Network Peering for Vertex
AI <https://cloud.google.com/vertex-ai/docs/general/vpc-peering>`__.
If this field is left unspecified, the resources aren't
peered with any network.
kms_key_name (str):
Optional. Customer-managed encryption key for the
PersistentResource. If set, this PersistentResource and all
sub-resources of this PersistentResource will be secured by
this key.
service_account (str):
Optional. Default service account that this
PersistentResource's workloads run as. The workloads
including
- Any runtime specified via ``ResourceRuntimeSpec`` on
creation time, for example, Ray.
- Jobs submitted to PersistentResource, if no other service
account specified in the job specs.
Only works when custom service account is enabled and users
have the ``iam.serviceAccounts.actAs`` permission on this
service account.
reserved_ip_ranges (MutableSequence[str]):
Optional. A list of names for the reserved IP ranges under
the VPC network that can be used for this persistent
resource.
If set, we will deploy the persistent resource within the
provided IP ranges. Otherwise, the persistent resource is
deployed to any IP ranges under the provided VPC network.
Example ['vertex-ai-ip-range'].
sync (bool):
Whether to execute this method synchronously. If False, this
method will be executed in concurrent Future and any downstream
object will be immediately returned and synced when the Future
has completed.
project (str):
Project to create this PersistentResource in. Overrides project
set in aiplatform.init.
location (str):
Location to create this PersistentResource in. Overrides
location set in aiplatform.init.
credentials (auth_credentials.Credentials):
Custom credentials to use to create this PersistentResource.
Overrides credentials set in aiplatform.init.
Returns:
persistent_resource (PersistentResource):
The object representation of the newly created
PersistentResource.
"""
if labels:
utils.validate_labels(labels)
gca_persistent_resource = gca_persistent_resource_compat.PersistentResource(
name=persistent_resource_id,
display_name=display_name,
resource_pools=resource_pools,
labels=labels,
network=network,
reserved_ip_ranges=reserved_ip_ranges,
)
if kms_key_name:
gca_persistent_resource.encryption_spec = (
gca_encryption_spec_compat.EncryptionSpec(kms_key_name=kms_key_name)
)
if service_account:
service_account_spec = gca_persistent_resource_compat.ServiceAccountSpec(
enable_custom_service_account=True, service_account=service_account
)
gca_persistent_resource.resource_runtime_spec = (
gca_persistent_resource_compat.ResourceRuntimeSpec(
service_account_spec=service_account_spec
)
)
api_client = cls._instantiate_client(location, credentials).select_version(
"v1beta1"
)
create_lro = cls._create(
api_client=api_client,
parent=initializer.global_config.common_location_path(
project=project, location=location
),
persistent_resource=gca_persistent_resource,
persistent_resource_id=persistent_resource_id,
)
_LOGGER.log_create_with_lro(cls, create_lro)
create_lro.result(timeout=None)
persistent_resource_result = cls(
persistent_resource_id=persistent_resource_id,
project=project,
location=location,
credentials=credentials,
)
_LOGGER.log_create_complete(
cls, persistent_resource_result._gca_resource, "persistent resource"
)
return persistent_resource_result
@classmethod
def _create(
cls,
api_client: (
persistent_resource_service_client_compat.PersistentResourceServiceClient
),
parent: str,
persistent_resource: gca_persistent_resource_compat.PersistentResource,
persistent_resource_id: str,
create_request_timeout: Optional[float] = None,
) -> operation.Operation:
"""Creates a PersistentResource directly calling the API client.
Args:
api_client (PersistentResourceServiceClient):
An instance of PersistentResourceServiceClient with the correct
api_endpoint already set based on user's preferences.
parent (str):
Required. Also known as the common location path; it contains the
project and location that the user provided to the upstream method,
e.g. "projects/my-project/locations/us-central1".
persistent_resource (gca_persistent_resource_compat.PersistentResource):
Required. The PersistentResource object to use for the create request.
persistent_resource_id (str):
Required. The ID to use for the PersistentResource,
which becomes the final component of the
PersistentResource's resource name.
The maximum length is 63 characters, and valid
characters are ``/^[a-z]([a-z0-9-]{0,61}[a-z0-9])?$/``.
This corresponds to the ``persistent_resource_id`` field
on the ``request`` instance; if ``request`` is provided, this
should not be set.
create_request_timeout (float):
Optional. The timeout for the create request in seconds.
Returns:
operation (Operation):
The long-running operation returned by the Persistent Resource
create call.
"""
return api_client.create_persistent_resource(
parent=parent,
persistent_resource_id=persistent_resource_id,
persistent_resource=persistent_resource,
timeout=create_request_timeout,
)
@classmethod
def list(
cls,
filter: Optional[str] = None,
order_by: Optional[str] = None,
project: Optional[str] = None,
location: Optional[str] = None,
credentials: Optional[auth_credentials.Credentials] = None,
) -> List["PersistentResource"]:
"""Lists a Persistent Resources on the provided project and region.
Args:
filter (str):
Optional. An expression for filtering the results of the request.
For field names both snake_case and camelCase are supported.
order_by (str):
Optional. A comma-separated list of fields to order by, sorted in
ascending order. Use "desc" after a field name for descending.
Supported fields: `display_name`, `create_time`, `update_time`
project (str):
Optional. Project to retrieve list from. If not set, project
set in aiplatform.init will be used.
location (str):
Optional. Location to retrieve list from. If not set, location
set in aiplatform.init will be used.
credentials (auth_credentials.Credentials):
Optional. Custom credentials to use to retrieve list. Overrides
credentials set in aiplatform.init.
Returns:
List[PersistentResource]
A list of PersistentResource objects.
"""
return cls._list_with_local_order(
filter=filter,
order_by=order_by,
project=project,
location=location,
credentials=credentials,
)
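
A hedged usage sketch for the PersistentResource class above. The import path, project, and machine configuration are assumptions; `resource_pools` uses the dict form accepted by `create`.

```
from google.cloud import aiplatform
# Assumed import path for the module shown above; adjust to your install.
from google.cloud.aiplatform.preview import persistent_resource

aiplatform.init(project="my-project", location="us-central1")

resource = persistent_resource.PersistentResource.create(
    persistent_resource_id="my-persistent-resource",  # lowercase, <= 63 chars
    resource_pools=[
        {
            "machine_spec": {"machine_type": "n1-standard-4"},
            "replica_count": 1,
        }
    ],
)
print(resource.state)  # RUNNING once provisioning completes
```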


@@ -0,0 +1,615 @@
# -*- coding: utf-8 -*-
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import datetime
import re
from typing import Any, Dict, List, Optional
from google.auth import credentials as auth_credentials
from google.cloud import aiplatform_v1beta1
from google.cloud.aiplatform import base
from google.cloud.aiplatform import compat
from google.cloud.aiplatform import initializer
from google.cloud.aiplatform import pipeline_job_schedules
from google.cloud.aiplatform import utils
from google.cloud.aiplatform.constants import pipeline as pipeline_constants
from google.cloud.aiplatform.metadata import constants as metadata_constants
from google.cloud.aiplatform.metadata import experiment_resources
from google.cloud.aiplatform.pipeline_jobs import (
PipelineJob as PipelineJobGa,
)
from google.cloud.aiplatform_v1.services.pipeline_service import (
PipelineServiceClient as PipelineServiceClientGa,
)
from google.protobuf import json_format
_LOGGER = base.Logger(__name__)
# Pattern for valid names used as a Vertex resource name.
_VALID_NAME_PATTERN = pipeline_constants._VALID_NAME_PATTERN
# Pattern for an Artifact Registry URL.
_VALID_AR_URL = pipeline_constants._VALID_AR_URL
# Pattern for any JSON or YAML file over HTTPS.
_VALID_HTTPS_URL = pipeline_constants._VALID_HTTPS_URL
def _get_current_time() -> datetime.datetime:
"""Gets the current timestamp."""
return datetime.datetime.now()
def _set_enable_caching_value(
pipeline_spec: Dict[str, Any], enable_caching: bool
) -> None:
"""Sets pipeline tasks caching options.
Args:
pipeline_spec (Dict[str, Any]):
Required. The dictionary of pipeline spec.
enable_caching (bool):
Required. Whether to enable caching.
"""
for component in [pipeline_spec["root"]] + list(
pipeline_spec["components"].values()
):
if "dag" in component:
for task in component["dag"]["tasks"].values():
task["cachingOptions"] = {"enableCache": enable_caching}
class _PipelineJob(
PipelineJobGa,
experiment_loggable_schemas=(
experiment_resources._ExperimentLoggableSchema(
title=metadata_constants.SYSTEM_PIPELINE_RUN
),
),
):
"""Preview PipelineJob resource for Vertex AI."""
def __init__(
self,
display_name: str,
template_path: str,
job_id: Optional[str] = None,
pipeline_root: Optional[str] = None,
parameter_values: Optional[Dict[str, Any]] = None,
input_artifacts: Optional[Dict[str, str]] = None,
enable_caching: Optional[bool] = None,
encryption_spec_key_name: Optional[str] = None,
labels: Optional[Dict[str, str]] = None,
credentials: Optional[auth_credentials.Credentials] = None,
project: Optional[str] = None,
location: Optional[str] = None,
failure_policy: Optional[str] = None,
enable_preflight_validations: Optional[bool] = False,
default_runtime: Optional[Dict[str, Any]] = None,
):
"""Retrieves a PipelineJob resource and instantiates its
representation.
Args:
display_name (str):
Required. The user-defined name of this Pipeline.
template_path (str):
Required. The path of PipelineJob or PipelineSpec JSON or YAML file. It
can be a local path, a Google Cloud Storage URI (e.g. "gs://project.name"),
an Artifact Registry URI (e.g.
"https://us-central1-kfp.pkg.dev/proj/repo/pack/latest"), or an HTTPS URI.
job_id (str):
Optional. The unique ID of the job run.
If not specified, pipeline name + timestamp will be used.
pipeline_root (str):
Optional. The root of the pipeline outputs. If not set, the staging bucket
set in aiplatform.init will be used. If that's not set a pipeline-specific
artifacts bucket will be used.
parameter_values (Dict[str, Any]):
Optional. The mapping from runtime parameter names to its values that
control the pipeline run.
input_artifacts (Dict[str, str]):
Optional. The mapping from the runtime parameter name for this artifact to its resource id.
For example: "vertex_model":"456". Note: full resource name ("projects/123/locations/us-central1/metadataStores/default/artifacts/456") cannot be used.
enable_caching (bool):
Optional. Whether to turn on caching for the run.
If this is not set, defaults to the compile time settings, which
are True for all tasks by default, while users may specify
different caching options for individual tasks.
If this is set, the setting applies to all tasks in the pipeline.
Overrides the compile time settings.
encryption_spec_key_name (str):
Optional. The Cloud KMS resource identifier of the customer
managed encryption key used to protect the job. Has the
form:
``projects/my-project/locations/my-region/keyRings/my-kr/cryptoKeys/my-key``.
The key needs to be in the same region as where the compute
resource is created.
If this is set, then all
resources created by the PipelineJob will
be encrypted with the provided encryption key.
Overrides encryption_spec_key_name set in aiplatform.init.
labels (Dict[str, str]):
Optional. The user defined metadata to organize PipelineJob.
credentials (auth_credentials.Credentials):
Optional. Custom credentials to use to create this PipelineJob.
Overrides credentials set in aiplatform.init.
project (str):
Optional. The project that you want to run this PipelineJob in. If not set,
the project set in aiplatform.init will be used.
location (str):
Optional. Location to create PipelineJob. If not set,
location set in aiplatform.init will be used.
failure_policy (str):
Optional. The failure policy - "slow" or "fast".
Currently, the default of a pipeline is that the pipeline will continue to
run until no more tasks can be executed, also known as
PIPELINE_FAILURE_POLICY_FAIL_SLOW (corresponds to "slow").
However, if a pipeline is set to
PIPELINE_FAILURE_POLICY_FAIL_FAST (corresponds to "fast"),
it will stop scheduling any new tasks when a task has failed. Any
scheduled tasks will continue to completion.
enable_preflight_validations (bool):
Optional. Whether to enable preflight validations.
default_runtime (Dict[str, Any]):
Optional. Specifies the runtime for the entire pipeline.
All tasks will use the configured runtime unless overridden at the task level.
If not provided, Vertex Training Custom Job (on-demand) will be used as the default runtime.
Supported Runtimes:
- Custom Job (On-Demand) Runtime: Default if default_runtime is not provided or None.
- Persistent Resource Runtime: To use a persistent resource as the runtime, see reference configuration below:
default_runtime = {
"persistentResourceRuntimeDetail": {
"persistentResourceName": "projects/my-project/locations/my-location/persistentResources/my-persistent",
"taskResourceUnavailableWaitTimeMs": 1000, # Time (ms) to wait if resource is unavailable
"taskResourceUnavailableTimeoutBehavior": "FAIL", # Behavior if resource is unavailable after wait
}
}
For more information, please see https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.pipelineJobs#PipelineJob.DefaultRuntime.
Raises:
ValueError: If job_id or labels have incorrect format.
"""
super().__init__(
display_name=display_name,
template_path=template_path,
job_id=job_id,
pipeline_root=pipeline_root,
parameter_values=parameter_values,
input_artifacts=input_artifacts,
enable_caching=enable_caching,
encryption_spec_key_name=encryption_spec_key_name,
labels=labels,
credentials=credentials,
project=project,
location=location,
failure_policy=failure_policy,
)
# Rebuild the v1beta1 versions of pipeline_job and runtime_config.
pipeline_json = utils.yaml_utils.load_yaml(
template_path, self.project, self.credentials
)
# pipeline_json can be either a PipelineJob or a PipelineSpec.
if pipeline_json.get("pipelineSpec") is not None:
pipeline_job = pipeline_json
pipeline_root = (
pipeline_root
or pipeline_job["pipelineSpec"].get("defaultPipelineRoot")
or pipeline_job["runtimeConfig"].get("gcsOutputDirectory")
or initializer.global_config.staging_bucket
)
else:
pipeline_job = {
"pipelineSpec": pipeline_json,
"runtimeConfig": {},
}
pipeline_root = (
pipeline_root
or pipeline_job["pipelineSpec"].get("defaultPipelineRoot")
or initializer.global_config.staging_bucket
)
pipeline_root = (
pipeline_root
or utils.gcs_utils.generate_gcs_directory_for_pipeline_artifacts(
project=project,
location=location,
)
)
builder = utils.pipeline_utils.PipelineRuntimeConfigBuilder.from_job_spec_json(
pipeline_job
)
builder.update_pipeline_root(pipeline_root)
builder.update_runtime_parameters(parameter_values)
builder.update_input_artifacts(input_artifacts)
builder.update_failure_policy(failure_policy)
builder.update_default_runtime(default_runtime)
runtime_config_dict = builder.build()
runtime_config = aiplatform_v1beta1.PipelineJob.RuntimeConfig()._pb
json_format.ParseDict(runtime_config_dict, runtime_config)
pipeline_name = pipeline_job["pipelineSpec"]["pipelineInfo"]["name"]
self.job_id = job_id or "{pipeline_name}-{timestamp}".format(
pipeline_name=re.sub("[^-0-9a-z]+", "-", pipeline_name.lower())
.lstrip("-")
.rstrip("-"),
timestamp=_get_current_time().strftime("%Y%m%d%H%M%S"),
)
if not _VALID_NAME_PATTERN.match(self.job_id):
raise ValueError(
f"Generated job ID: {self.job_id} is illegal as a Vertex pipelines job ID. "
"Expecting an ID following the regex pattern "
f'"{_VALID_NAME_PATTERN.pattern[1:-1]}"'
)
if enable_caching is not None:
_set_enable_caching_value(pipeline_job["pipelineSpec"], enable_caching)
pipeline_job_args = {
"display_name": display_name,
"pipeline_spec": pipeline_job["pipelineSpec"],
"labels": labels,
"runtime_config": runtime_config,
"encryption_spec": initializer.global_config.get_encryption_spec(
encryption_spec_key_name=encryption_spec_key_name
),
"preflight_validations": enable_preflight_validations,
}
if _VALID_AR_URL.match(template_path) or _VALID_HTTPS_URL.match(template_path):
pipeline_job_args["template_uri"] = template_path
self._v1_beta1_pipeline_job = aiplatform_v1beta1.PipelineJob(
**pipeline_job_args
)
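# A minimal construction sketch (illustrative only: the import path, bucket,
# template file, and persistent resource name below are assumptions, not
# values taken from this module):
#
#   from google.cloud.aiplatform.preview.pipelinejob import pipeline_jobs
#
#   job = pipeline_jobs._PipelineJob(
#       display_name="my-pipeline",
#       template_path="gs://my-bucket/pipeline.yaml",
#       default_runtime={
#           "persistentResourceRuntimeDetail": {
#               "persistentResourceName": "projects/my-project/locations/us-central1/persistentResources/my-persistent",
#               "taskResourceUnavailableWaitTimeMs": 1000,
#               "taskResourceUnavailableTimeoutBehavior": "FAIL",
#           }
#       },
#   )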
def create_schedule(
self,
cron_expression: str,
display_name: str,
start_time: Optional[str] = None,
end_time: Optional[str] = None,
allow_queueing: bool = False,
max_run_count: Optional[int] = None,
max_concurrent_run_count: int = 1,
service_account: Optional[str] = None,
network: Optional[str] = None,
create_request_timeout: Optional[float] = None,
) -> "pipeline_job_schedules.PipelineJobSchedule": # noqa: F821
"""Creates a PipelineJobSchedule directly from a PipelineJob.
Example Usage:
pipeline_job = aiplatform.PipelineJob(
display_name='job_display_name',
template_path='your_pipeline.yaml',
)
pipeline_job.run()
pipeline_job_schedule = pipeline_job.create_schedule(
cron_expression='* * * * *',
display_name='schedule_display_name',
)
Args:
cron_expression (str):
Required. Time specification (cron schedule expression) to launch scheduled runs.
To explicitly set a timezone for the cron tab, apply a prefix: "CRON_TZ=${IANA_TIME_ZONE}" or "TZ=${IANA_TIME_ZONE}".
The ${IANA_TIME_ZONE} must be a valid string from the IANA time zone database.
For example, "CRON_TZ=America/New_York 1 * * * *", or "TZ=America/New_York 1 * * * *".
display_name (str):
Required. The user-defined name of this PipelineJobSchedule.
start_time (str):
Optional. Timestamp after which the first run can be scheduled.
If unspecified, it defaults to the schedule creation timestamp.
end_time (str):
Optional. Timestamp after which no more runs will be scheduled.
If unspecified, then runs will be scheduled indefinitely.
allow_queueing (bool):
Optional. Whether new scheduled runs can be queued when the max_concurrent_run_count limit is reached.
max_run_count (int):
Optional. Maximum run count of the schedule.
If specified, the schedule will be completed when either started_run_count >= max_run_count or when end_time is reached.
Must be positive and <= 2^63-1.
max_concurrent_run_count (int):
Optional. Maximum number of runs that can be started concurrently for this PipelineJobSchedule.
service_account (str):
Optional. Specifies the service account for workload run-as account.
Users submitting jobs must have act-as permission on this run-as account.
network (str):
Optional. The full name of the Compute Engine network to which the job
should be peered. For example, projects/12345/global/networks/myVPC.
Private services access must already be configured for the network.
If left unspecified, the network set in aiplatform.init will be used.
Otherwise, the job is not peered with any network.
create_request_timeout (float):
Optional. The timeout for the create request in seconds.
Returns:
A Vertex AI PipelineJobSchedule.
"""
return super().create_schedule(
cron=cron_expression,
display_name=display_name,
start_time=start_time,
end_time=end_time,
allow_queueing=allow_queueing,
max_run_count=max_run_count,
max_concurrent_run_count=max_concurrent_run_count,
service_account=service_account,
network=network,
create_request_timeout=create_request_timeout,
)
@classmethod
def batch_delete(
cls,
names: List[str],
project: Optional[str] = None,
location: Optional[str] = None,
) -> aiplatform_v1beta1.BatchDeletePipelineJobsResponse:
"""
Example Usage:
aiplatform.init(
project='your_project_name',
location='your_location',
)
aiplatform.PipelineJob.batch_delete(
names=['pipeline_job_name', 'pipeline_job_name2']
)
Args:
names (List[str]):
Required. The fully-qualified resource name or ID of the
Pipeline Jobs to batch delete. Example:
"projects/123/locations/us-central1/pipelineJobs/456"
or "456" when project and location are initialized or passed.
project (str):
Optional. Project containing the Pipeline Jobs to
batch delete. If not set, the project given to `aiplatform.init`
will be used.
location (str):
Optional. Location containing the Pipeline Jobs to
batch delete. If not set, the location given to `aiplatform.init`
will be used.
Returns:
BatchDeletePipelineJobsResponse contains PipelineJobs deleted.
"""
user_project = project or initializer.global_config.project
user_location = location or initializer.global_config.location
parent = initializer.global_config.common_location_path(
project=user_project, location=user_location
)
pipeline_jobs_names = [
utils.full_resource_name(
resource_name=name,
resource_noun="pipelineJobs",
parse_resource_name_method=PipelineServiceClientGa.parse_pipeline_job_path,
format_resource_name_method=PipelineServiceClientGa.pipeline_job_path,
project=user_project,
location=user_location,
)
for name in names
]
request = aiplatform_v1beta1.BatchDeletePipelineJobsRequest(
parent=parent, names=pipeline_jobs_names
)
client = cls._instantiate_client(
location=user_location,
appended_user_agent=["preview-pipeline-jobs-batch-delete"],
)
v1beta1_client = client.select_version(compat.V1BETA1)
operation = v1beta1_client.batch_delete_pipeline_jobs(request)
return operation.result()
def submit(
self,
service_account: Optional[str] = None,
network: Optional[str] = None,
reserved_ip_ranges: Optional[List[str]] = None,
create_request_timeout: Optional[float] = None,
job_id: Optional[str] = None,
) -> None:
"""Run this configured PipelineJob.
Args:
service_account (str):
Optional. Specifies the service account for workload run-as account.
Users submitting jobs must have act-as permission on this run-as account.
network (str):
Optional. The full name of the Compute Engine network to which the job
should be peered. For example, projects/12345/global/networks/myVPC.
Private services access must already be configured for the network.
If left unspecified, the network set in aiplatform.init will be used.
Otherwise, the job is not peered with any network.
reserved_ip_ranges (List[str]):
Optional. A list of names for the reserved IP ranges under the VPC
network that can be used for this PipelineJob's workload. For example: ['vertex-ai-ip-range'].
If left unspecified, the job will be deployed to any IP ranges under
the provided VPC network.
create_request_timeout (float):
Optional. The timeout for the create request in seconds.
job_id (str):
Optional. The ID to use for the PipelineJob, which will become the final
component of the PipelineJob name. If not provided, an ID will be
automatically generated.
"""
network = network or initializer.global_config.network
service_account = service_account or initializer.global_config.service_account
gca_resource = self._v1_beta1_pipeline_job
if service_account:
gca_resource.service_account = service_account
if network:
gca_resource.network = network
if reserved_ip_ranges:
gca_resource.reserved_ip_ranges = reserved_ip_ranges
user_project = initializer.global_config.project
user_location = initializer.global_config.location
parent = initializer.global_config.common_location_path(
project=user_project, location=user_location
)
client = self._instantiate_client(
location=user_location,
appended_user_agent=["preview-pipeline-job-submit"],
)
v1beta1_client = client.select_version(compat.V1BETA1)
_LOGGER.log_create_with_lro(self.__class__)
request = aiplatform_v1beta1.CreatePipelineJobRequest(
parent=parent,
pipeline_job=self._v1_beta1_pipeline_job,
pipeline_job_id=job_id or self.job_id,
)
response = v1beta1_client.create_pipeline_job(request=request)
self._gca_resource = response
_LOGGER.log_create_complete_with_getter(
self.__class__, self._gca_resource, "pipeline_job"
)
_LOGGER.info("View Pipeline Job:\n%s" % self._dashboard_uri())
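# A hedged sketch of submit(); the service account, network, and IP range
# values below are illustrative assumptions:
#
#   job.submit(
#       service_account="runner@my-project.iam.gserviceaccount.com",
#       network="projects/12345/global/networks/myVPC",
#       reserved_ip_ranges=["vertex-ai-ip-range"],
#   )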
def rerun(
self,
original_pipelinejob_name: str,
pipeline_task_rerun_configs: Optional[
List[aiplatform_v1beta1.PipelineTaskRerunConfig]
] = None,
parameter_values: Optional[Dict[str, Any]] = None,
job_id: Optional[str] = None,
service_account: Optional[str] = None,
network: Optional[str] = None,
reserved_ip_ranges: Optional[List[str]] = None,
) -> None:
"""Rerun a PipelineJob.
Args:
original_pipelinejob_name (str):
Required. The name of the original PipelineJob.
pipeline_task_rerun_configs (List[aiplatform_v1beta1.PipelineTaskRerunConfig]):
Optional. The list of PipelineTaskRerunConfig to specify the tasks to rerun.
parameter_values (Dict[str, Any]):
Optional. The parameter values to override the original PipelineJob.
job_id (str):
Optional. The ID to use for the PipelineJob, which will become the final
component of the PipelineJob name. If not provided, an ID will be
automatically generated.
service_account (str):
Optional. Specifies the service account for workload run-as account.
Users submitting jobs must have act-as permission on this run-as account.
network (str):
Optional. The full name of the Compute Engine network to which the job
should be peered. For example, projects/12345/global/networks/myVPC.
Private services access must already be configured for the network.
If left unspecified, the network set in aiplatform.init will be used.
Otherwise, the job is not peered with any network.
reserved_ip_ranges (List[str]):
Optional. A list of names for the reserved IP ranges under the VPC
network that can be used for this PipelineJob's workload. For example: ['vertex-ai-ip-range'].
If left unspecified, the job will be deployed to any IP ranges under
the provided VPC network.
"""
network = network or initializer.global_config.network
service_account = service_account or initializer.global_config.service_account
gca_resource = self._v1_beta1_pipeline_job
if service_account:
gca_resource.service_account = service_account
if network:
gca_resource.network = network
if reserved_ip_ranges:
gca_resource.reserved_ip_ranges = reserved_ip_ranges
user_project = initializer.global_config.project
user_location = initializer.global_config.location
parent = initializer.global_config.common_location_path(
project=user_project, location=user_location
)
client = self._instantiate_client(
location=user_location,
appended_user_agent=["preview-pipeline-job-submit"],
)
v1beta1_client = client.select_version(compat.V1BETA1)
_LOGGER.log_create_with_lro(self.__class__)
pipeline_job = self._v1_beta1_pipeline_job
try:
get_request = aiplatform_v1beta1.GetPipelineJobRequest(
name=original_pipelinejob_name
)
original_pipeline_job = v1beta1_client.get_pipeline_job(request=get_request)
pipeline_job.original_pipeline_job_id = int(
original_pipeline_job.labels["vertex-ai-pipelines-run-billing-id"]
)
except Exception as e:
raise ValueError(
f"Failed to get original pipeline job: {original_pipelinejob_name}"
) from e
pipeline_job.pipeline_task_rerun_configs = pipeline_task_rerun_configs
if parameter_values:
runtime_config = self._v1_beta1_pipeline_job.runtime_config
runtime_config.parameter_values = parameter_values
pipeline_name = self._v1_beta1_pipeline_job.display_name
job_id = job_id or "{pipeline_name}-{timestamp}".format(
pipeline_name=re.sub("[^-0-9a-z]+", "-", pipeline_name.lower())
.lstrip("-")
.rstrip("-"),
timestamp=_get_current_time().strftime("%Y%m%d%H%M%S"),
)
request = aiplatform_v1beta1.CreatePipelineJobRequest(
parent=parent,
pipeline_job=self._v1_beta1_pipeline_job,
pipeline_job_id=job_id,
)
response = v1beta1_client.create_pipeline_job(request=request)
self._gca_resource = response
_LOGGER.log_create_complete_with_getter(
self.__class__, self._gca_resource, "pipeline_job"
)
_LOGGER.info("View Pipeline Job:\n%s" % self._dashboard_uri())
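# A minimal rerun sketch (assumed values; the original job name, task name,
# and parameter override are illustrative):
#
#   rerun_config = aiplatform_v1beta1.PipelineTaskRerunConfig(
#       task_name="train",
#       skip_task=False,
#   )
#   job.rerun(
#       original_pipelinejob_name="projects/123/locations/us-central1/pipelineJobs/456",
#       pipeline_task_rerun_configs=[rerun_config],
#       parameter_values={"learning_rate": 0.01},
#   )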

View File

@@ -0,0 +1,237 @@
# -*- coding: utf-8 -*-
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import List, Optional
from google.auth import credentials as auth_credentials
from google.cloud.aiplatform import (
PipelineJob,
)
from google.cloud.aiplatform.pipeline_job_schedules import (
PipelineJobSchedule as PipelineJobScheduleGa,
)
from google.cloud.aiplatform.preview.schedule.schedules import (
_Schedule as _SchedulePreview,
)
class PipelineJobSchedule(
PipelineJobScheduleGa,
_SchedulePreview,
):
def __init__(
self,
pipeline_job: PipelineJob,
display_name: str,
credentials: Optional[auth_credentials.Credentials] = None,
project: Optional[str] = None,
location: Optional[str] = None,
):
"""Retrieves a PipelineJobSchedule resource and instantiates its
representation.
Args:
pipeline_job (PipelineJob):
Required. PipelineJob used to init the schedule.
display_name (str):
Required. The user-defined name of this PipelineJobSchedule.
credentials (auth_credentials.Credentials):
Optional. Custom credentials to use to create this PipelineJobSchedule.
Overrides credentials set in aiplatform.init.
project (str):
Optional. The project that you want to run this PipelineJobSchedule in.
If not set, the project set in aiplatform.init will be used.
location (str):
Optional. Location to create PipelineJobSchedule. If not set,
location set in aiplatform.init will be used.
"""
super().__init__(
pipeline_job=pipeline_job,
display_name=display_name,
credentials=credentials,
project=project,
location=location,
)
def create(
self,
cron_expression: str,
start_time: Optional[str] = None,
end_time: Optional[str] = None,
allow_queueing: bool = False,
max_run_count: Optional[int] = None,
max_concurrent_run_count: int = 1,
service_account: Optional[str] = None,
network: Optional[str] = None,
create_request_timeout: Optional[float] = None,
) -> None:
"""Create a PipelineJobSchedule.
Args:
cron_expression (str):
Required. Time specification (cron schedule expression) to launch scheduled runs.
To explicitly set a timezone for the cron tab, apply a prefix: "CRON_TZ=${IANA_TIME_ZONE}" or "TZ=${IANA_TIME_ZONE}".
The ${IANA_TIME_ZONE} must be a valid string from the IANA time zone database.
For example, "CRON_TZ=America/New_York 1 * * * *", or "TZ=America/New_York 1 * * * *".
start_time (str):
Optional. Timestamp after which the first run can be scheduled.
If unspecified, it defaults to the schedule creation timestamp.
end_time (str):
Optional. Timestamp after which no more runs will be scheduled.
If unspecified, then runs will be scheduled indefinitely.
allow_queueing (bool):
Optional. Whether new scheduled runs can be queued when the max_concurrent_run_count limit is reached.
max_run_count (int):
Optional. Maximum run count of the schedule.
If specified, the schedule will be completed when either started_run_count >= max_run_count or when end_time is reached.
Must be positive and <= 2^63-1.
max_concurrent_run_count (int):
Optional. Maximum number of runs that can be started concurrently for this PipelineJobSchedule.
service_account (str):
Optional. Specifies the service account for workload run-as account.
Users submitting jobs must have act-as permission on this run-as account.
network (str):
Optional. The full name of the Compute Engine network to which the job
should be peered. For example, projects/12345/global/networks/myVPC.
Private services access must already be configured for the network.
If left unspecified, the network set in aiplatform.init will be used.
Otherwise, the job is not peered with any network.
create_request_timeout (float):
Optional. The timeout for the create request in seconds.
"""
super().create(
cron=cron_expression,
start_time=start_time,
end_time=end_time,
allow_queueing=allow_queueing,
max_run_count=max_run_count,
max_concurrent_run_count=max_concurrent_run_count,
service_account=service_account,
network=network,
create_request_timeout=create_request_timeout,
)
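# A short usage sketch, assuming `pipeline_job` is an existing PipelineJob
# (the cron expression and counts are illustrative):
#
#   schedule = PipelineJobSchedule(
#       pipeline_job=pipeline_job,
#       display_name="nightly-run",
#   )
#   schedule.create(
#       cron_expression="TZ=America/New_York 0 2 * * *",
#       max_concurrent_run_count=1,
#       allow_queueing=True,
#   )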
def list_jobs(
self,
filter: Optional[str] = None,
order_by: Optional[str] = None,
enable_simple_view: bool = False,
project: Optional[str] = None,
location: Optional[str] = None,
credentials: Optional[auth_credentials.Credentials] = None,
) -> List[PipelineJob]:
"""List all PipelineJob 's created by this PipelineJobSchedule.
Example usage:
pipeline_job_schedule.list_jobs(order_by='create_time_desc')
Args:
filter (str):
Optional. An expression for filtering the results of the request.
For field names both snake_case and camelCase are supported.
order_by (str):
Optional. A comma-separated list of fields to order by, sorted in
ascending order. Use "desc" after a field name for descending.
Supported fields: `display_name`, `create_time`, `update_time`
enable_simple_view (bool):
Optional. Whether to pass the `read_mask` parameter to the list call.
Defaults to False if not provided. This will improve the performance of calling
list(). However, the returned PipelineJob list will not include all fields for
each PipelineJob. Setting this to True will exclude the following fields in your
response: `runtime_config`, `service_account`, `network`, and some subfields of
`pipeline_spec` and `job_detail`. The following fields will be included in
each PipelineJob resource in your response: `state`, `display_name`,
`pipeline_spec.pipeline_info`, `create_time`, `start_time`, `end_time`,
`update_time`, `labels`, `template_uri`, `template_metadata.version`,
`job_detail.pipeline_run_context`, `job_detail.pipeline_context`.
project (str):
Optional. Project to retrieve list from. If not set, project
set in aiplatform.init will be used.
location (str):
Optional. Location to retrieve list from. If not set, location
set in aiplatform.init will be used.
credentials (auth_credentials.Credentials):
Optional. Custom credentials to use to retrieve list. Overrides
credentials set in aiplatform.init.
Returns:
List[PipelineJob] - A list of PipelineJob resource objects.
"""
return super().list_jobs(
filter=filter,
order_by=order_by,
enable_simple_view=enable_simple_view,
project=project,
location=location,
credentials=credentials,
)
def update(
self,
display_name: Optional[str] = None,
cron_expression: Optional[str] = None,
start_time: Optional[str] = None,
end_time: Optional[str] = None,
allow_queueing: Optional[bool] = None,
max_run_count: Optional[int] = None,
max_concurrent_run_count: Optional[int] = None,
) -> None:
"""Update an existing PipelineJobSchedule.
Example usage:
pipeline_job_schedule.update(
display_name='updated-display-name',
cron_expression='* * * * *',
)
Args:
display_name (str):
Optional. The user-defined name of this PipelineJobSchedule.
cron_expression (str):
Optional. Time specification (cron schedule expression) to launch scheduled runs.
To explicitly set a timezone for the cron tab, apply a prefix: "CRON_TZ=${IANA_TIME_ZONE}" or "TZ=${IANA_TIME_ZONE}".
The ${IANA_TIME_ZONE} must be a valid string from the IANA time zone database.
For example, "CRON_TZ=America/New_York 1 * * * *", or "TZ=America/New_York 1 * * * *".
start_time (str):
Optional. Timestamp after which the first run can be scheduled.
If unspecified, it defaults to the schedule creation timestamp.
end_time (str):
Optional. Timestamp after which no more runs will be scheduled.
If unspecified, then runs will be scheduled indefinitely.
allow_queueing (bool):
Optional. Whether new scheduled runs can be queued when the max_concurrent_run_count limit is reached.
max_run_count (int):
Optional. Maximum run count of the schedule.
If specified, the schedule will be completed when either started_run_count >= max_run_count or when end_time is reached.
Must be positive and <= 2^63-1.
max_concurrent_run_count (int):
Optional. Maximum number of runs that can be started concurrently for this PipelineJobSchedule.
Raises:
RuntimeError: User tried to call update() before create().
"""
super().update(
display_name=display_name,
cron=cron_expression,
start_time=start_time,
end_time=end_time,
allow_queueing=allow_queueing,
max_run_count=max_run_count,
max_concurrent_run_count=max_concurrent_run_count,
)

View File

@@ -0,0 +1,94 @@
# -*- coding: utf-8 -*-
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import NamedTuple, Optional, Dict, Union
from google.cloud.aiplatform import utils
from google.cloud.aiplatform.compat.types import (
accelerator_type_v1beta1 as gca_accelerator_type_compat,
)
class _ResourcePool(NamedTuple):
"""Specification container for Worker Pool specs used for distributed training.
Usage:
resource_pool = _ResourcePool(
replica_count=1,
machine_type='n1-standard-4',
accelerator_count=1,
accelerator_type='NVIDIA_TESLA_K80',
boot_disk_type='pd-ssd',
boot_disk_size_gb=100,
)
Note that container and python package specs are not stored with this spec.
"""
replica_count: int = 1
machine_type: str = "n1-standard-4"
accelerator_count: int = 0
accelerator_type: str = "ACCELERATOR_TYPE_UNSPECIFIED"
boot_disk_type: str = "pd-ssd"
boot_disk_size_gb: int = 100
def _get_accelerator_type(self) -> Optional[str]:
"""Validates accelerator_type and returns the name of the accelerator.
Returns:
The validated accelerator name, or None if no accelerator is used.
Raises:
ValueError: If the accelerator type is invalid.
"""
# Raises ValueError if invalid accelerator_type
utils.validate_accelerator_type(self.accelerator_type)
accelerator_enum = getattr(
gca_accelerator_type_compat.AcceleratorType, self.accelerator_type
)
if (
accelerator_enum
!= gca_accelerator_type_compat.AcceleratorType.ACCELERATOR_TYPE_UNSPECIFIED
):
return self.accelerator_type
@property
def spec_dict(self) -> Dict[str, Union[int, str, Dict[str, Union[int, str]]]]:
"""Return specification as a Dict."""
spec = {
"machine_spec": {"machine_type": self.machine_type},
"replica_count": self.replica_count,
"disk_spec": {
"boot_disk_type": self.boot_disk_type,
"boot_disk_size_gb": self.boot_disk_size_gb,
},
}
accelerator_type = self._get_accelerator_type()
if accelerator_type and self.accelerator_count:
spec["machine_spec"]["accelerator_type"] = accelerator_type
spec["machine_spec"]["accelerator_count"] = self.accelerator_count
return spec
@property
def is_empty(self) -> bool:
"""Returns True is replica_count > 0 False otherwise."""
return self.replica_count <= 0
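# A small sketch of the dict produced by spec_dict (the accelerator values
# are illustrative):
#
#   pool = _ResourcePool(
#       replica_count=2,
#       machine_type="n1-standard-8",
#       accelerator_type="NVIDIA_TESLA_T4",
#       accelerator_count=1,
#   )
#   pool.spec_dict
#   # {
#   #     "machine_spec": {
#   #         "machine_type": "n1-standard-8",
#   #         "accelerator_type": "NVIDIA_TESLA_T4",
#   #         "accelerator_count": 1,
#   #     },
#   #     "replica_count": 2,
#   #     "disk_spec": {"boot_disk_type": "pd-ssd", "boot_disk_size_gb": 100},
#   # }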

View File

@@ -0,0 +1,55 @@
# -*- coding: utf-8 -*-
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from google.auth import credentials as auth_credentials
from google.cloud.aiplatform.schedules import _Schedule as _ScheduleGa
class _Schedule(
_ScheduleGa,
):
"""Preview Schedule resource for Vertex AI."""
def __init__(
self,
credentials: auth_credentials.Credentials,
project: str,
location: str,
):
"""Retrieves a Schedule resource and instantiates its representation.
Args:
credentials (auth_credentials.Credentials):
Optional. Custom credentials to use to create this Schedule.
Overrides credentials set in aiplatform.init.
project (str):
Optional. The project that you want to run this Schedule in.
If not set, the project set in aiplatform.init will be used.
location (str):
Optional. Location to create Schedule. If not set,
location set in aiplatform.init will be used.
"""
super().__init__(project=project, location=location, credentials=credentials)
@property
def cron_expression(self) -> str:
"""Current Schedule cron expression.
Returns:
Schedule cron expression.
"""
return super().cron
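# A tiny usage sketch (assumes `schedule` is an instantiated preview
# schedule); the preview property simply surfaces the GA `cron` field:
#
#   schedule.cron_expression  # e.g. "TZ=America/New_York 0 2 * * *"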

View File

@@ -0,0 +1,64 @@
"""Ray on Vertex AI."""
# -*- coding: utf-8 -*-
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import sys
from google.cloud.aiplatform.vertex_ray.bigquery_datasource import (
_BigQueryDatasource,
)
from google.cloud.aiplatform.vertex_ray.client_builder import (
VertexRayClientBuilder as ClientBuilder,
)
from google.cloud.aiplatform.vertex_ray.cluster_init import (
create_ray_cluster,
delete_ray_cluster,
get_ray_cluster,
list_ray_clusters,
update_ray_cluster,
)
from google.cloud.aiplatform.vertex_ray import data
from google.cloud.aiplatform.vertex_ray.util.resources import (
Resources,
NodeImages,
)
from google.cloud.aiplatform.vertex_ray.dashboard_sdk import (
get_job_submission_client_cluster_info,
)
if sys.version_info[:2] not in ((3, 10), (3, 11)):
print(
"[Ray on Vertex]: A client environment with Python 3.10 or 3.11 is required."
)
__all__ = (
"_BigQueryDatasource",
"data",
"ClientBuilder",
"get_job_submission_client_cluster_info",
"create_ray_cluster",
"delete_ray_cluster",
"get_ray_cluster",
"list_ray_clusters",
"update_ray_cluster",
"Resources",
"NodeImages",
)
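# A hedged cluster-creation sketch built from the exports above; the machine
# shapes and node counts are illustrative assumptions:
#
#   from google.cloud.aiplatform import vertex_ray
#   from google.cloud.aiplatform.vertex_ray.util.resources import Resources
#
#   cluster_name = vertex_ray.create_ray_cluster(
#       head_node_type=Resources(machine_type="n1-standard-16"),
#       worker_node_types=[Resources(machine_type="n1-standard-8", node_count=2)],
#   )
#   cluster = vertex_ray.get_ray_cluster(cluster_name)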

View File

@@ -0,0 +1,18 @@
"""Ray on Vertex AI Prediction."""
# -*- coding: utf-8 -*-
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

View File

@@ -0,0 +1,24 @@
"""Ray on Vertex AI Prediction Tensorflow."""
# -*- coding: utf-8 -*-
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from google.cloud.aiplatform.vertex_ray.predict.sklearn import (
register_sklearn,
)
__all__ = ("register_sklearn",)

View File

@@ -0,0 +1,24 @@
"""Ray on Vertex AI Prediction Tensorflow."""
# -*- coding: utf-8 -*-
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from google.cloud.aiplatform.vertex_ray.predict.tensorflow import (
register_tensorflow,
)
__all__ = ("register_tensorflow",)

View File

@@ -0,0 +1,24 @@
"""Ray on Vertex AI Prediction Tensorflow."""
# -*- coding: utf-8 -*-
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from google.cloud.aiplatform.vertex_ray.predict.torch import (
get_pytorch_model_from,
)
__all__ = ("get_pytorch_model_from",)

View File

@@ -0,0 +1,24 @@
"""Ray on Vertex AI Prediction Tensorflow."""
# -*- coding: utf-8 -*-
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from google.cloud.aiplatform.vertex_ray.predict.xgboost import (
register_xgboost,
)
__all__ = ("register_xgboost",)