# -*- coding: utf-8 -*-

# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from typing import Iterable, Optional, Union, Sequence, Dict, List, Tuple

import abc
import copy
import datetime
import time
import tempfile

from google.auth import credentials as auth_credentials
from google.api_core import exceptions as api_exceptions
from google.protobuf import duration_pb2  # type: ignore
from google.protobuf import field_mask_pb2  # type: ignore
from google.rpc import status_pb2

from google.cloud import aiplatform
from google.cloud.aiplatform import base
from google.cloud.aiplatform.compat.types import (
    batch_prediction_job as gca_bp_job_compat,
    completion_stats as gca_completion_stats,
    custom_job as gca_custom_job_compat,
    execution as gca_execution_compat,
    explanation as gca_explanation_compat,
    encryption_spec as gca_encryption_spec_compat,
    io as gca_io_compat,
    job_state as gca_job_state,
    hyperparameter_tuning_job as gca_hyperparameter_tuning_job_compat,
    study as gca_study_compat,
    model_deployment_monitoring_job as gca_model_deployment_monitoring_job_compat,
    job_state_v1beta1 as gca_job_state_v1beta1,
    model_monitoring_v1beta1 as gca_model_monitoring_v1beta1,
)  # TODO(b/242108750): remove temporary logic once model monitoring for batch prediction is GA

from google.cloud.aiplatform.constants import base as constants
from google.cloud.aiplatform.metadata import constants as metadata_constants
from google.cloud.aiplatform import initializer
from google.cloud.aiplatform import hyperparameter_tuning
from google.cloud.aiplatform import model_monitoring
from google.cloud.aiplatform import utils
from google.cloud.aiplatform import _publisher_models
from google.cloud.aiplatform.utils import console_utils
from google.cloud.aiplatform.utils import source_utils
from google.cloud.aiplatform.utils import worker_spec_utils

from google.cloud.aiplatform_v1.types import (
    batch_prediction_job as batch_prediction_job_v1,
)
from google.cloud.aiplatform_v1.types import custom_job as custom_job_v1

_LOGGER = base.Logger(__name__)

# TODO(b/242108750): remove temporary logic once model monitoring for batch prediction is GA
_JOB_COMPLETE_STATES = (
    gca_job_state.JobState.JOB_STATE_SUCCEEDED,
    gca_job_state.JobState.JOB_STATE_FAILED,
    gca_job_state.JobState.JOB_STATE_CANCELLED,
    gca_job_state.JobState.JOB_STATE_PAUSED,
    gca_job_state_v1beta1.JobState.JOB_STATE_SUCCEEDED,
    gca_job_state_v1beta1.JobState.JOB_STATE_FAILED,
    gca_job_state_v1beta1.JobState.JOB_STATE_CANCELLED,
    gca_job_state_v1beta1.JobState.JOB_STATE_PAUSED,
)

_JOB_ERROR_STATES = (
    gca_job_state.JobState.JOB_STATE_FAILED,
    gca_job_state.JobState.JOB_STATE_CANCELLED,
    gca_job_state_v1beta1.JobState.JOB_STATE_FAILED,
    gca_job_state_v1beta1.JobState.JOB_STATE_CANCELLED,
)

_JOB_PENDING_STATES = (
    gca_job_state.JobState.JOB_STATE_QUEUED,
    gca_job_state.JobState.JOB_STATE_PENDING,
    gca_job_state.JobState.JOB_STATE_RUNNING,
    gca_job_state.JobState.JOB_STATE_CANCELLING,
    gca_job_state.JobState.JOB_STATE_UPDATING,
    gca_job_state_v1beta1.JobState.JOB_STATE_QUEUED,
    gca_job_state_v1beta1.JobState.JOB_STATE_PENDING,
    gca_job_state_v1beta1.JobState.JOB_STATE_RUNNING,
    gca_job_state_v1beta1.JobState.JOB_STATE_CANCELLING,
    gca_job_state_v1beta1.JobState.JOB_STATE_UPDATING,
)

# _block_until_complete wait times
_JOB_WAIT_TIME = 5  # start at five seconds
_LOG_WAIT_TIME = 5
_MAX_WAIT_TIME = 60 * 5  # 5 minute wait
_WAIT_TIME_MULTIPLIER = 2  # scale wait by 2 every iteration
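
# With the defaults above, `_block_until_complete` polls the job state every
# _JOB_WAIT_TIME seconds, while the interval between "current state" log lines
# doubles each time it elapses: 5s, 10s, 20s, 40s, ..., capped at
# _MAX_WAIT_TIME (300s).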


class _Job(base.VertexAiStatefulResource):
    """Class that represents a general Job resource in Vertex AI.
    Cannot be directly instantiated.

    Serves as base class to specific Job types, e.g. BatchPredictionJob or
    DataLabelingJob, to re-use shared functionality.

    Subclasses require the following class attributes:

    _getter_method (str): The name of the JobServiceClient getter method for the
        specific Job type, e.g. 'get_custom_job' for CustomJob
    _cancel_method (str): The name of the specific JobServiceClient cancel method
    _delete_method (str): The name of the specific JobServiceClient delete method
    """

    client_class = utils.JobClientWithOverride

    # Required by the done() method
    _valid_done_states = _JOB_COMPLETE_STATES

    def __init__(
        self,
        job_name: str,
        project: Optional[str] = None,
        location: Optional[str] = None,
        credentials: Optional[auth_credentials.Credentials] = None,
    ):
        """Retrieves a Job subclass resource by calling a subclass-specific
        getter method.

        Args:
            job_name (str):
                Required. A fully-qualified job resource name or job ID.
                Example: "projects/123/locations/us-central1/batchPredictionJobs/456" or
                "456" when project, location and job_type are initialized or passed.
            project (str):
                Optional. Project to retrieve the Job subclass from. If not set,
                the project set in aiplatform.init will be used.
            location (str):
                Optional. Location to retrieve the Job subclass from. If not set,
                the location set in aiplatform.init will be used.
            credentials (auth_credentials.Credentials):
                Optional. Custom credentials to use. If not set, credentials set
                in aiplatform.init will be used.
        """
        super().__init__(
            project=project,
            location=location,
            credentials=credentials,
            resource_name=job_name,
        )
        self._gca_resource = self._get_gca_resource(resource_name=job_name)
    @property
    def state(self) -> gca_job_state.JobState:
        """Fetch Job again and return the current JobState.

        Returns:
            state (job_state.JobState):
                Enum that describes the state of a Vertex AI job.
        """

        # Fetch the Job again for most up-to-date job state
        self._sync_gca_resource()

        return self._gca_resource.state

    @property
    def start_time(self) -> Optional[datetime.datetime]:
        """Time when the Job resource entered the `JOB_STATE_RUNNING` state for
        the first time."""
        self._sync_gca_resource()
        return getattr(self._gca_resource, "start_time")

    @property
    def end_time(self) -> Optional[datetime.datetime]:
        """Time when the Job resource entered the `JOB_STATE_SUCCEEDED`,
        `JOB_STATE_FAILED`, or `JOB_STATE_CANCELLED` state."""
        self._sync_gca_resource()
        return getattr(self._gca_resource, "end_time")

    @property
    def error(self) -> Optional[status_pb2.Status]:
        """Detailed error info for this Job resource. Only populated when the
        Job's state is `JOB_STATE_FAILED` or `JOB_STATE_CANCELLED`."""
        self._sync_gca_resource()
        return getattr(self._gca_resource, "error")

    @property
    @abc.abstractmethod
    def _job_type(cls) -> str:
        """Job type."""
        pass

    @property
    @abc.abstractmethod
    def _cancel_method(cls) -> str:
        """Name of cancellation method for cancelling the specific job type."""
        pass

    def _dashboard_uri(self) -> Optional[str]:
        """Helper method to compose the dashboard uri where the job can be
        viewed."""
        fields = self._parse_resource_name(self.resource_name)
        location = fields.pop("location")
        project = fields.pop("project")
        job = list(fields.values())[0]
        url = f"https://console.cloud.google.com/ai/platform/locations/{location}/{self._job_type}/{job}?project={project}"
        return url

    def _log_job_state(self):
        """Helper method to log job state."""
        _LOGGER.info(
            "%s %s current state:\n%s"
            % (
                self.__class__.__name__,
                self._gca_resource.name,
                self._gca_resource.state,
            )
        )

    def _block_until_complete(self):
        """Helper method to block and check on job until complete.

        Raises:
            RuntimeError: If job failed or cancelled.
        """

        log_wait = _LOG_WAIT_TIME

        previous_time = time.time()
        while self.state not in _JOB_COMPLETE_STATES:
            current_time = time.time()
            if current_time - previous_time >= log_wait:
                self._log_job_state()
                log_wait = min(log_wait * _WAIT_TIME_MULTIPLIER, _MAX_WAIT_TIME)
                previous_time = current_time
            time.sleep(_JOB_WAIT_TIME)

        self._log_job_state()

        # Error is only populated when the job state is
        # JOB_STATE_FAILED or JOB_STATE_CANCELLED.
        if self._gca_resource.state in _JOB_ERROR_STATES:
            raise RuntimeError("Job failed with:\n%s" % self._gca_resource.error)
        else:
            _LOGGER.log_action_completed_against_resource("run", "completed", self)

    def wait_for_completion(self) -> None:
        """Waits for the job to complete.

        Raises:
            RuntimeError: If job failed or cancelled.
        """
        self._block_until_complete()

    @classmethod
    def list(
        cls,
        filter: Optional[str] = None,
        order_by: Optional[str] = None,
        project: Optional[str] = None,
        location: Optional[str] = None,
        credentials: Optional[auth_credentials.Credentials] = None,
    ) -> List[base.VertexAiResourceNoun]:
        """List all instances of this Job Resource.

        Example Usage:

            aiplatform.BatchPredictionJob.list(
                filter='state="JOB_STATE_SUCCEEDED" AND display_name="my_job"',
            )

        Args:
            filter (str):
                Optional. An expression for filtering the results of the request.
                For field names both snake_case and camelCase are supported.
            order_by (str):
                Optional. A comma-separated list of fields to order by, sorted in
                ascending order. Use "desc" after a field name for descending.
                Supported fields: `display_name`, `create_time`, `update_time`
            project (str):
                Optional. Project to retrieve list from. If not set, project
                set in aiplatform.init will be used.
            location (str):
                Optional. Location to retrieve list from. If not set, location
                set in aiplatform.init will be used.
            credentials (auth_credentials.Credentials):
                Optional. Custom credentials to use to retrieve list. Overrides
                credentials set in aiplatform.init.

        Returns:
            List[VertexAiResourceNoun] - A list of Job resource objects.
        """

        return cls._list_with_local_order(
            filter=filter,
            order_by=order_by,
            project=project,
            location=location,
            credentials=credentials,
        )

    def cancel(self) -> None:
        """Cancels this Job.

        Success of cancellation is not guaranteed. Use the `Job.state`
        property to verify whether cancellation was successful.
        """

        _LOGGER.log_action_start_against_resource("Cancelling", "run", self)
        getattr(self.api_client, self._cancel_method)(name=self.resource_name)


class BatchPredictionJob(_Job):
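    """A Vertex AI batch prediction job resource.

    Use `BatchPredictionJob.create` to create a job and wait for it to
    complete, or `BatchPredictionJob.submit` to create one without blocking.
    """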

    _resource_noun = "batchPredictionJobs"
    _getter_method = "get_batch_prediction_job"
    _list_method = "list_batch_prediction_jobs"
    _cancel_method = "cancel_batch_prediction_job"
    _delete_method = "delete_batch_prediction_job"
    _job_type = "batch-predictions"
    _parse_resource_name_method = "parse_batch_prediction_job_path"
    _format_resource_name_method = "batch_prediction_job_path"

    def __init__(
        self,
        batch_prediction_job_name: str,
        project: Optional[str] = None,
        location: Optional[str] = None,
        credentials: Optional[auth_credentials.Credentials] = None,
    ):
        """Retrieves a BatchPredictionJob resource and instantiates its
        representation.

        Args:
            batch_prediction_job_name (str):
                Required. A fully-qualified BatchPredictionJob resource name or ID.
                Example: "projects/.../locations/.../batchPredictionJobs/456" or
                "456" when project and location are initialized or passed.
            project (str):
                Optional. Project to retrieve the BatchPredictionJob from. If not
                set, the project set in aiplatform.init will be used.
            location (str):
                Optional. Location to retrieve the BatchPredictionJob from. If not
                set, the location set in aiplatform.init will be used.
            credentials (auth_credentials.Credentials):
                Optional. Custom credentials to use. If not set, credentials set
                in aiplatform.init will be used.
        """

        super().__init__(
            job_name=batch_prediction_job_name,
            project=project,
            location=location,
            credentials=credentials,
        )
    @property
    def output_info(
        self,
    ) -> Optional[batch_prediction_job_v1.BatchPredictionJob.OutputInfo]:
        """Information describing the output of this job, including output location
        into which prediction output is written.

        This is only available for batch prediction jobs that have run successfully.
        """
        self._assert_gca_resource_is_available()
        return self._gca_resource.output_info

    @property
    def partial_failures(self) -> Optional[Sequence[status_pb2.Status]]:
        """Partial failures encountered. For example, single files that can't be read.
        This field never exceeds 20 entries. Status details fields contain standard
        GCP error details."""
        self._assert_gca_resource_is_available()
        return getattr(self._gca_resource, "partial_failures")

    @property
    def completion_stats(self) -> Optional[gca_completion_stats.CompletionStats]:
        """Statistics on completed and failed prediction instances."""
        self._assert_gca_resource_is_available()
        return getattr(self._gca_resource, "completion_stats")

    @classmethod
    def create(
        cls,
        # TODO(b/223262536): Make the job_display_name parameter optional in the next major release
        job_display_name: str,
        model_name: Union[str, "aiplatform.Model"],
        instances_format: str = "jsonl",
        predictions_format: str = "jsonl",
        gcs_source: Optional[Union[str, Sequence[str]]] = None,
        bigquery_source: Optional[str] = None,
        gcs_destination_prefix: Optional[str] = None,
        bigquery_destination_prefix: Optional[str] = None,
        model_parameters: Optional[Dict] = None,
        machine_type: Optional[str] = None,
        accelerator_type: Optional[str] = None,
        accelerator_count: Optional[int] = None,
        starting_replica_count: Optional[int] = None,
        max_replica_count: Optional[int] = None,
        generate_explanation: Optional[bool] = False,
        explanation_metadata: Optional["aiplatform.explain.ExplanationMetadata"] = None,
        explanation_parameters: Optional[
            "aiplatform.explain.ExplanationParameters"
        ] = None,
        labels: Optional[Dict[str, str]] = None,
        project: Optional[str] = None,
        location: Optional[str] = None,
        credentials: Optional[auth_credentials.Credentials] = None,
        encryption_spec_key_name: Optional[str] = None,
        sync: bool = True,
        create_request_timeout: Optional[float] = None,
        batch_size: Optional[int] = None,
        model_monitoring_objective_config: Optional[
            "aiplatform.model_monitoring.ObjectiveConfig"
        ] = None,
        model_monitoring_alert_config: Optional[
            "aiplatform.model_monitoring.AlertConfig"
        ] = None,
        analysis_instance_schema_uri: Optional[str] = None,
        service_account: Optional[str] = None,
    ) -> "BatchPredictionJob":
        """Create a batch prediction job.

        Args:
            job_display_name (str):
                Required. The user-defined name of the BatchPredictionJob.
                The name can be up to 128 characters long and can consist
                of any UTF-8 characters.
            model_name (Union[str, aiplatform.Model]):
                Required. A fully-qualified model resource name or model ID.
                Example: "projects/123/locations/us-central1/models/456" or
                "456" when project and location are initialized or passed.
                May optionally contain a version ID or alias in
                {model_name}@{version} form.

                Or an instance of aiplatform.Model.
            instances_format (str):
                Required. The format in which instances are provided. Must be one
                of the formats listed in `Model.supported_input_storage_formats`.
                Default is "jsonl" when using `gcs_source`. If a `bigquery_source`
                is provided, this is overridden to "bigquery".
            predictions_format (str):
                Required. The format in which Vertex AI outputs the
                predictions, must be one of the formats specified in
                `Model.supported_output_storage_formats`.
                Default is "jsonl" when using `gcs_destination_prefix`. If a
                `bigquery_destination_prefix` is provided, this is overridden to
                "bigquery".
            gcs_source (Optional[Union[str, Sequence[str]]]):
                Google Cloud Storage URI(-s) to your instances to run
                batch prediction on. They must match `instances_format`.

            bigquery_source (Optional[str]):
                BigQuery URI to a table, up to 2000 characters long. For example:
                `bq://projectId.bqDatasetId.bqTableId`
            gcs_destination_prefix (Optional[str]):
                The Google Cloud Storage location of the directory where the
                output is to be written. In the given directory a new
                directory is created. Its name is
                ``prediction-<model-display-name>-<job-create-time>``, where
                the timestamp is in YYYY-MM-DDThh:mm:ss.sssZ ISO-8601 format.
                Inside of it files ``predictions_0001.<extension>``,
                ``predictions_0002.<extension>``, ...,
                ``predictions_N.<extension>`` are created where
                ``<extension>`` depends on chosen ``predictions_format``,
                and N may equal 0001 and depends on the total number of
                successfully predicted instances. If the Model has both
                ``instance`` and ``prediction`` schemata defined then each such
                file contains predictions as per the ``predictions_format``.
                If prediction for any instance failed (partially or
                completely), then additional ``errors_0001.<extension>``,
                ``errors_0002.<extension>``, ..., ``errors_N.<extension>``
                files are created (N depends on the total number of failed
                predictions). These files contain the failed instances, as
                per their schema, followed by an additional ``error`` field
                whose value is a ``google.rpc.Status`` containing only
                ``code`` and ``message`` fields.
            bigquery_destination_prefix (Optional[str]):
                The BigQuery project or dataset location where the output is
                to be written. If project is provided, a new dataset is
                created with name
                ``prediction_<model-display-name>_<job-create-time>``, where
                the model display name is made BigQuery-dataset-name
                compatible (for example, most special characters become
                underscores), and the timestamp is in
                YYYY_MM_DDThh_mm_ss_sssZ "based on ISO-8601" format. In the
                dataset two tables will be created, ``predictions``, and
                ``errors``. If the Model has both
                [instance][google.cloud.aiplatform.v1.PredictSchemata.instance_schema_uri]
                and
                [prediction][google.cloud.aiplatform.v1.PredictSchemata.parameters_schema_uri]
                schemata defined then the tables have columns as follows:
                The ``predictions`` table contains instances for which the
                prediction succeeded; it has columns as per a concatenation
                of the Model's instance and prediction schemata. The
                ``errors`` table contains rows for which the prediction
                failed; it has instance columns, as per the instance schema,
                followed by a single "errors" column, whose values are
                [google.rpc.Status][google.rpc.Status] represented as a
                STRUCT, and containing only ``code`` and ``message``.
            model_parameters (Optional[Dict]):
                The parameters that govern the predictions. The schema of
                the parameters may be specified via the Model's `parameters_schema_uri`.
            machine_type (Optional[str]):
                The type of machine for running batch prediction on
                dedicated resources. Not specifying machine type will result in
                the batch prediction job being run with automatic resources.
            accelerator_type (Optional[str]):
                The type of accelerator(s) that may be attached
                to the machine as per `accelerator_count`. Only used if
                `machine_type` is set.
            accelerator_count (Optional[int]):
                The number of accelerators to attach to the
                `machine_type`. Only used if `machine_type` is set.
            starting_replica_count (Optional[int]):
                The number of machine replicas used at the start of the batch
                operation. If not set, Vertex AI decides the starting number, not
                greater than `max_replica_count`. Only used if `machine_type` is
                set.
            max_replica_count (Optional[int]):
                The maximum number of machine replicas the batch operation may
                be scaled to. Only used if `machine_type` is set.
                Default is 10.
            generate_explanation (bool):
                Optional. Generate explanation along with the batch prediction
                results. This will cause the batch prediction output to include
                explanations based on the `predictions_format`:
                - `bigquery`: output includes a column named `explanation`. The value
                    is a struct that conforms to the [aiplatform.gapic.Explanation] object.
                - `jsonl`: The JSON objects on each line include an additional entry
                    keyed `explanation`. The value of the entry is a JSON object that
                    conforms to the [aiplatform.gapic.Explanation] object.
                - `csv`: Generating explanations for CSV format is not supported.
            explanation_metadata (aiplatform.explain.ExplanationMetadata):
                Optional. Explanation metadata configuration for this BatchPredictionJob.
                Can be specified only if `generate_explanation` is set to `True`.

                This value overrides the value of `Model.explanation_metadata`.
                All fields of `explanation_metadata` are optional in the request. If
                a field of the `explanation_metadata` object is not populated, the
                corresponding field of the `Model.explanation_metadata` object is inherited.
                For more details, see `Ref docs <http://tinyurl.com/1igh60kt>`
            explanation_parameters (aiplatform.explain.ExplanationParameters):
                Optional. Parameters to configure explaining for Model's predictions.
                Can be specified only if `generate_explanation` is set to `True`.

                This value overrides the value of `Model.explanation_parameters`.
                All fields of `explanation_parameters` are optional in the request. If
                a field of the `explanation_parameters` object is not populated, the
                corresponding field of the `Model.explanation_parameters` object is inherited.
                For more details, see `Ref docs <http://tinyurl.com/1an4zake>`
            labels (Dict[str, str]):
                Optional. The labels with user-defined metadata to organize your
                BatchPredictionJobs. Label keys and values can be no longer than
                64 characters (Unicode codepoints), can only contain lowercase
                letters, numeric characters, underscores and dashes.
                International characters are allowed. See https://goo.gl/xmQnxf
                for more information and examples of labels.
            project (str):
                Optional. Project to create this batch prediction job in. If not
                set, the project set in aiplatform.init will be used.
            location (str):
                Optional. Location to create this batch prediction job in. If not
                set, the location set in aiplatform.init will be used.
            credentials (Optional[auth_credentials.Credentials]):
                Custom credentials to use to create this batch prediction
                job. Overrides credentials set in aiplatform.init.
            encryption_spec_key_name (Optional[str]):
                Optional. The Cloud KMS resource identifier of the customer
                managed encryption key used to protect the job. Has the
                form:
                ``projects/my-project/locations/my-region/keyRings/my-kr/cryptoKeys/my-key``.
                The key needs to be in the same region as where the compute
                resource is created.

                If this is set, then all
                resources created by the BatchPredictionJob will
                be encrypted with the provided encryption key.

                Overrides encryption_spec_key_name set in aiplatform.init.
            sync (bool):
                Whether to execute this method synchronously. If False, this method
                will be executed in a concurrent Future and any downstream object
                will be immediately returned and synced when the Future has completed.
            create_request_timeout (float):
                Optional. The timeout for the create request in seconds.
            batch_size (int):
                Optional. The number of records (e.g. instances) sent to a machine
                replica in each batch. Consider the machine type and the size of a
                single record when setting this parameter: a higher value speeds up
                the batch operation's execution, but a value that is too high may
                cause a batch to exceed the replica's memory and fail the whole
                operation.
                The default value is 64.
            model_monitoring_objective_config (aiplatform.model_monitoring.ObjectiveConfig):
                Optional. The objective config for model monitoring. Passing this parameter enables
                monitoring on the model associated with this batch prediction job.
            model_monitoring_alert_config (aiplatform.model_monitoring.EmailAlertConfig):
                Optional. Configures how model monitoring alerts are sent to the user. Right now
                only email alert is supported.
            analysis_instance_schema_uri (str):
                Optional. Only applicable if model_monitoring_objective_config is also passed.
                This parameter specifies the URI of the YAML schema file describing
                the format of a single instance that you want Tensorflow Data
                Validation (TFDV) to analyze. If this field is empty, all the
                feature data types are inferred from predict_instance_schema_uri,
                meaning that TFDV will use the data in the exact format as
                prediction request/response.
                If there are any data type differences between the predict instance
                and the TFDV instance, this field can be used to override the schema.
                For models trained with Vertex AI, this field must be set so that
                all the fields of the predict instance are formatted as strings.
            service_account (str):
                Optional. Specifies the service account that the workload runs as.
                Users submitting jobs must have act-as permission on this run-as account.
        Returns:
            (jobs.BatchPredictionJob):
                Instantiated representation of the created batch prediction job.
        """
        return cls._submit_impl(
            job_display_name=job_display_name,
            model_name=model_name,
            instances_format=instances_format,
            predictions_format=predictions_format,
            gcs_source=gcs_source,
            bigquery_source=bigquery_source,
            gcs_destination_prefix=gcs_destination_prefix,
            bigquery_destination_prefix=bigquery_destination_prefix,
            model_parameters=model_parameters,
            machine_type=machine_type,
            accelerator_type=accelerator_type,
            accelerator_count=accelerator_count,
            starting_replica_count=starting_replica_count,
            max_replica_count=max_replica_count,
            generate_explanation=generate_explanation,
            explanation_metadata=explanation_metadata,
            explanation_parameters=explanation_parameters,
            labels=labels,
            project=project,
            location=location,
            credentials=credentials,
            encryption_spec_key_name=encryption_spec_key_name,
            sync=sync,
            create_request_timeout=create_request_timeout,
            batch_size=batch_size,
            model_monitoring_objective_config=model_monitoring_objective_config,
            model_monitoring_alert_config=model_monitoring_alert_config,
            analysis_instance_schema_uri=analysis_instance_schema_uri,
            service_account=service_account,
            # Main distinction of `create` vs `submit`:
            wait_for_completion=True,
        )

    @classmethod
    def submit(
        cls,
        *,
        job_display_name: Optional[str] = None,
        model_name: Union[str, "aiplatform.Model"],
        instances_format: str = "jsonl",
        predictions_format: str = "jsonl",
        gcs_source: Optional[Union[str, Sequence[str]]] = None,
        bigquery_source: Optional[str] = None,
        gcs_destination_prefix: Optional[str] = None,
        bigquery_destination_prefix: Optional[str] = None,
        model_parameters: Optional[Dict] = None,
        machine_type: Optional[str] = None,
        accelerator_type: Optional[str] = None,
        accelerator_count: Optional[int] = None,
        starting_replica_count: Optional[int] = None,
        max_replica_count: Optional[int] = None,
        generate_explanation: Optional[bool] = False,
        explanation_metadata: Optional["aiplatform.explain.ExplanationMetadata"] = None,
        explanation_parameters: Optional[
            "aiplatform.explain.ExplanationParameters"
        ] = None,
        labels: Optional[Dict[str, str]] = None,
        project: Optional[str] = None,
        location: Optional[str] = None,
        credentials: Optional[auth_credentials.Credentials] = None,
        encryption_spec_key_name: Optional[str] = None,
        create_request_timeout: Optional[float] = None,
        batch_size: Optional[int] = None,
        model_monitoring_objective_config: Optional[
            "aiplatform.model_monitoring.ObjectiveConfig"
        ] = None,
        model_monitoring_alert_config: Optional[
            "aiplatform.model_monitoring.AlertConfig"
        ] = None,
        analysis_instance_schema_uri: Optional[str] = None,
        service_account: Optional[str] = None,
    ) -> "BatchPredictionJob":
        """Submit a batch prediction job (without waiting for completion).

        Args:
            job_display_name (str):
                Optional. The user-defined name of the BatchPredictionJob.
                The name can be up to 128 characters long and can consist
                of any UTF-8 characters. If not provided, a display name is
                generated automatically.
            model_name (Union[str, aiplatform.Model]):
                Required. A fully-qualified model resource name or model ID.
                Example: "projects/123/locations/us-central1/models/456" or
                "456" when project and location are initialized or passed.
                May optionally contain a version ID or alias in
                {model_name}@{version} form.

                Or an instance of aiplatform.Model.
            instances_format (str):
                Required. The format in which instances are provided. Must be one
                of the formats listed in `Model.supported_input_storage_formats`.
                Default is "jsonl" when using `gcs_source`. If a `bigquery_source`
                is provided, this is overridden to "bigquery".
            predictions_format (str):
                Required. The format in which Vertex AI outputs the
                predictions, must be one of the formats specified in
                `Model.supported_output_storage_formats`.
                Default is "jsonl" when using `gcs_destination_prefix`. If a
                `bigquery_destination_prefix` is provided, this is overridden to
                "bigquery".
            gcs_source (Optional[Union[str, Sequence[str]]]):
                Google Cloud Storage URI(-s) to your instances to run
                batch prediction on. They must match `instances_format`.

            bigquery_source (Optional[str]):
                BigQuery URI to a table, up to 2000 characters long. For example:
                `bq://projectId.bqDatasetId.bqTableId`
            gcs_destination_prefix (Optional[str]):
                The Google Cloud Storage location of the directory where the
                output is to be written. In the given directory a new
                directory is created. Its name is
                ``prediction-<model-display-name>-<job-create-time>``, where
                the timestamp is in YYYY-MM-DDThh:mm:ss.sssZ ISO-8601 format.
                Inside of it files ``predictions_0001.<extension>``,
                ``predictions_0002.<extension>``, ...,
                ``predictions_N.<extension>`` are created where
                ``<extension>`` depends on chosen ``predictions_format``,
                and N may equal 0001 and depends on the total number of
                successfully predicted instances. If the Model has both
                ``instance`` and ``prediction`` schemata defined then each such
                file contains predictions as per the ``predictions_format``.
                If prediction for any instance failed (partially or
                completely), then additional ``errors_0001.<extension>``,
                ``errors_0002.<extension>``, ..., ``errors_N.<extension>``
                files are created (N depends on the total number of failed
                predictions). These files contain the failed instances, as
                per their schema, followed by an additional ``error`` field
                whose value is a ``google.rpc.Status`` containing only
                ``code`` and ``message`` fields.
            bigquery_destination_prefix (Optional[str]):
                The BigQuery project or dataset location where the output is
                to be written. If project is provided, a new dataset is
                created with name
                ``prediction_<model-display-name>_<job-create-time>``, where
                the model display name is made BigQuery-dataset-name
                compatible (for example, most special characters become
                underscores), and the timestamp is in
                YYYY_MM_DDThh_mm_ss_sssZ "based on ISO-8601" format. In the
                dataset two tables will be created, ``predictions``, and
                ``errors``. If the Model has both
                [instance][google.cloud.aiplatform.v1.PredictSchemata.instance_schema_uri]
                and
                [prediction][google.cloud.aiplatform.v1.PredictSchemata.parameters_schema_uri]
                schemata defined then the tables have columns as follows:
                The ``predictions`` table contains instances for which the
                prediction succeeded; it has columns as per a concatenation
                of the Model's instance and prediction schemata. The
                ``errors`` table contains rows for which the prediction
                failed; it has instance columns, as per the instance schema,
                followed by a single "errors" column, whose values are
                [google.rpc.Status][google.rpc.Status] represented as a
                STRUCT, and containing only ``code`` and ``message``.
            model_parameters (Optional[Dict]):
                The parameters that govern the predictions. The schema of
                the parameters may be specified via the Model's `parameters_schema_uri`.
            machine_type (Optional[str]):
                The type of machine for running batch prediction on
                dedicated resources. Not specifying machine type will result in
                the batch prediction job being run with automatic resources.
            accelerator_type (Optional[str]):
                The type of accelerator(s) that may be attached
                to the machine as per `accelerator_count`. Only used if
                `machine_type` is set.
            accelerator_count (Optional[int]):
                The number of accelerators to attach to the
                `machine_type`. Only used if `machine_type` is set.
            starting_replica_count (Optional[int]):
                The number of machine replicas used at the start of the batch
                operation. If not set, Vertex AI decides the starting number, not
                greater than `max_replica_count`. Only used if `machine_type` is
                set.
            max_replica_count (Optional[int]):
                The maximum number of machine replicas the batch operation may
                be scaled to. Only used if `machine_type` is set.
                Default is 10.
            generate_explanation (bool):
                Optional. Generate explanation along with the batch prediction
                results. This will cause the batch prediction output to include
                explanations based on the `predictions_format`:
                - `bigquery`: output includes a column named `explanation`. The value
                    is a struct that conforms to the [aiplatform.gapic.Explanation] object.
                - `jsonl`: The JSON objects on each line include an additional entry
                    keyed `explanation`. The value of the entry is a JSON object that
                    conforms to the [aiplatform.gapic.Explanation] object.
                - `csv`: Generating explanations for CSV format is not supported.
            explanation_metadata (aiplatform.explain.ExplanationMetadata):
                Optional. Explanation metadata configuration for this BatchPredictionJob.
                Can be specified only if `generate_explanation` is set to `True`.

                This value overrides the value of `Model.explanation_metadata`.
                All fields of `explanation_metadata` are optional in the request. If
                a field of the `explanation_metadata` object is not populated, the
                corresponding field of the `Model.explanation_metadata` object is inherited.
                For more details, see `Ref docs <http://tinyurl.com/1igh60kt>`
            explanation_parameters (aiplatform.explain.ExplanationParameters):
                Optional. Parameters to configure explaining for Model's predictions.
                Can be specified only if `generate_explanation` is set to `True`.

                This value overrides the value of `Model.explanation_parameters`.
                All fields of `explanation_parameters` are optional in the request. If
                a field of the `explanation_parameters` object is not populated, the
                corresponding field of the `Model.explanation_parameters` object is inherited.
                For more details, see `Ref docs <http://tinyurl.com/1an4zake>`
            labels (Dict[str, str]):
                Optional. The labels with user-defined metadata to organize your
                BatchPredictionJobs. Label keys and values can be no longer than
                64 characters (Unicode codepoints), can only contain lowercase
                letters, numeric characters, underscores and dashes.
                International characters are allowed. See https://goo.gl/xmQnxf
                for more information and examples of labels.
            project (str):
                Optional. Project to create this batch prediction job in. If not
                set, the project set in aiplatform.init will be used.
            location (str):
                Optional. Location to create this batch prediction job in. If not
                set, the location set in aiplatform.init will be used.
            credentials (Optional[auth_credentials.Credentials]):
                Custom credentials to use to create this batch prediction
                job. Overrides credentials set in aiplatform.init.
            encryption_spec_key_name (Optional[str]):
                Optional. The Cloud KMS resource identifier of the customer
                managed encryption key used to protect the job. Has the
                form:
                ``projects/my-project/locations/my-region/keyRings/my-kr/cryptoKeys/my-key``.
                The key needs to be in the same region as where the compute
                resource is created.

                If this is set, then all
                resources created by the BatchPredictionJob will
                be encrypted with the provided encryption key.

                Overrides encryption_spec_key_name set in aiplatform.init.
            create_request_timeout (float):
                Optional. The timeout for the create request in seconds.
            batch_size (int):
                Optional. The number of records (e.g. instances) sent to a machine
                replica in each batch. Consider the machine type and the size of a
                single record when setting this parameter: a higher value speeds up
                the batch operation's execution, but a value that is too high may
                cause a batch to exceed the replica's memory and fail the whole
                operation.
                The default value is 64.
            model_monitoring_objective_config (aiplatform.model_monitoring.ObjectiveConfig):
                Optional. The objective config for model monitoring. Passing this parameter enables
                monitoring on the model associated with this batch prediction job.
            model_monitoring_alert_config (aiplatform.model_monitoring.EmailAlertConfig):
                Optional. Configures how model monitoring alerts are sent to the user. Right now
                only email alert is supported.
            analysis_instance_schema_uri (str):
                Optional. Only applicable if model_monitoring_objective_config is also passed.
                This parameter specifies the URI of the YAML schema file describing
                the format of a single instance that you want Tensorflow Data
                Validation (TFDV) to analyze. If this field is empty, all the
                feature data types are inferred from predict_instance_schema_uri,
                meaning that TFDV will use the data in the exact format as
                prediction request/response.
                If there are any data type differences between the predict instance
                and the TFDV instance, this field can be used to override the schema.
                For models trained with Vertex AI, this field must be set so that
                all the fields of the predict instance are formatted as strings.
            service_account (str):
                Optional. Specifies the service account that the workload runs as.
                Users submitting jobs must have act-as permission on this run-as account.
        Returns:
            (jobs.BatchPredictionJob):
                Instantiated representation of the created batch prediction job.
        """
        return cls._submit_impl(
            job_display_name=job_display_name,
            model_name=model_name,
            instances_format=instances_format,
            predictions_format=predictions_format,
            gcs_source=gcs_source,
            bigquery_source=bigquery_source,
            gcs_destination_prefix=gcs_destination_prefix,
            bigquery_destination_prefix=bigquery_destination_prefix,
            model_parameters=model_parameters,
            machine_type=machine_type,
            accelerator_type=accelerator_type,
            accelerator_count=accelerator_count,
            starting_replica_count=starting_replica_count,
            max_replica_count=max_replica_count,
            generate_explanation=generate_explanation,
            explanation_metadata=explanation_metadata,
            explanation_parameters=explanation_parameters,
            labels=labels,
            project=project,
            location=location,
            credentials=credentials,
            encryption_spec_key_name=encryption_spec_key_name,
            create_request_timeout=create_request_timeout,
            batch_size=batch_size,
            model_monitoring_objective_config=model_monitoring_objective_config,
            model_monitoring_alert_config=model_monitoring_alert_config,
            analysis_instance_schema_uri=analysis_instance_schema_uri,
            service_account=service_account,
            # Main distinction of `create` vs `submit`:
            wait_for_completion=False,
            sync=True,
        )

    @classmethod
    def _submit_impl(
        cls,
        *,
        job_display_name: Optional[str] = None,
        model_name: Union[str, "aiplatform.Model"],
        instances_format: str = "jsonl",
        predictions_format: str = "jsonl",
        gcs_source: Optional[Union[str, Sequence[str]]] = None,
        bigquery_source: Optional[str] = None,
        gcs_destination_prefix: Optional[str] = None,
        bigquery_destination_prefix: Optional[str] = None,
        model_parameters: Optional[Dict] = None,
        machine_type: Optional[str] = None,
        accelerator_type: Optional[str] = None,
        accelerator_count: Optional[int] = None,
        starting_replica_count: Optional[int] = None,
        max_replica_count: Optional[int] = None,
        generate_explanation: Optional[bool] = False,
        explanation_metadata: Optional["aiplatform.explain.ExplanationMetadata"] = None,
        explanation_parameters: Optional[
            "aiplatform.explain.ExplanationParameters"
        ] = None,
        labels: Optional[Dict[str, str]] = None,
        project: Optional[str] = None,
        location: Optional[str] = None,
        credentials: Optional[auth_credentials.Credentials] = None,
        encryption_spec_key_name: Optional[str] = None,
        sync: bool = True,
        create_request_timeout: Optional[float] = None,
        batch_size: Optional[int] = None,
        model_monitoring_objective_config: Optional[
            "aiplatform.model_monitoring.ObjectiveConfig"
        ] = None,
        model_monitoring_alert_config: Optional[
            "aiplatform.model_monitoring.AlertConfig"
        ] = None,
        analysis_instance_schema_uri: Optional[str] = None,
        service_account: Optional[str] = None,
        wait_for_completion: bool = False,
    ) -> "BatchPredictionJob":
        """Create a batch prediction job (shared implementation behind
        `create` and `submit`).

        Args:
            job_display_name (str):
                Optional. The user-defined name of the BatchPredictionJob.
                The name can be up to 128 characters long and can consist
                of any UTF-8 characters. If not provided, a display name is
                generated automatically.
            model_name (Union[str, aiplatform.Model]):
                Required. A fully-qualified model resource name or model ID.
                Example: "projects/123/locations/us-central1/models/456" or
                "456" when project and location are initialized or passed.
                May optionally contain a version ID or alias in
                {model_name}@{version} form.

                Or an instance of aiplatform.Model.
            instances_format (str):
                Required. The format in which instances are provided. Must be one
                of the formats listed in `Model.supported_input_storage_formats`.
                Default is "jsonl" when using `gcs_source`. If a `bigquery_source`
                is provided, this is overridden to "bigquery".
            predictions_format (str):
                Required. The format in which Vertex AI outputs the
                predictions, must be one of the formats specified in
                `Model.supported_output_storage_formats`.
                Default is "jsonl" when using `gcs_destination_prefix`. If a
                `bigquery_destination_prefix` is provided, this is overridden to
                "bigquery".
            gcs_source (Optional[Union[str, Sequence[str]]]):
                Google Cloud Storage URI(-s) to your instances to run
                batch prediction on. They must match `instances_format`.

            bigquery_source (Optional[str]):
                BigQuery URI to a table, up to 2000 characters long. For example:
                `bq://projectId.bqDatasetId.bqTableId`
            gcs_destination_prefix (Optional[str]):
                The Google Cloud Storage location of the directory where the
                output is to be written. In the given directory a new
                directory is created. Its name is
                ``prediction-<model-display-name>-<job-create-time>``, where
                the timestamp is in YYYY-MM-DDThh:mm:ss.sssZ ISO-8601 format.
                Inside of it files ``predictions_0001.<extension>``,
                ``predictions_0002.<extension>``, ...,
                ``predictions_N.<extension>`` are created where
                ``<extension>`` depends on chosen ``predictions_format``,
                and N may equal 0001 and depends on the total number of
                successfully predicted instances. If the Model has both
                ``instance`` and ``prediction`` schemata defined then each such
                file contains predictions as per the ``predictions_format``.
                If prediction for any instance failed (partially or
                completely), then additional ``errors_0001.<extension>``,
                ``errors_0002.<extension>``, ..., ``errors_N.<extension>``
                files are created (N depends on the total number of failed
                predictions). These files contain the failed instances, as
                per their schema, followed by an additional ``error`` field
                whose value is a ``google.rpc.Status`` containing only
                ``code`` and ``message`` fields.
            bigquery_destination_prefix (Optional[str]):
                The BigQuery project or dataset location where the output is
                to be written. If project is provided, a new dataset is
                created with name
                ``prediction_<model-display-name>_<job-create-time>``, where
                the model display name is made BigQuery-dataset-name
                compatible (for example, most special characters become
                underscores), and the timestamp is in
                YYYY_MM_DDThh_mm_ss_sssZ "based on ISO-8601" format. In the
                dataset two tables will be created, ``predictions``, and
                ``errors``. If the Model has both
                [instance][google.cloud.aiplatform.v1.PredictSchemata.instance_schema_uri]
                and
                [prediction][google.cloud.aiplatform.v1.PredictSchemata.parameters_schema_uri]
                schemata defined then the tables have columns as follows:
                The ``predictions`` table contains instances for which the
                prediction succeeded; it has columns as per a concatenation
                of the Model's instance and prediction schemata. The
                ``errors`` table contains rows for which the prediction
                failed; it has instance columns, as per the instance schema,
                followed by a single "errors" column, whose values are
                [google.rpc.Status][google.rpc.Status] represented as a
                STRUCT, and containing only ``code`` and ``message``.
            model_parameters (Optional[Dict]):
                The parameters that govern the predictions. The schema of
                the parameters may be specified via the Model's `parameters_schema_uri`.
            machine_type (Optional[str]):
                The type of machine for running batch prediction on
                dedicated resources. Not specifying machine type will result in
                the batch prediction job being run with automatic resources.
            accelerator_type (Optional[str]):
                The type of accelerator(s) that may be attached
                to the machine as per `accelerator_count`. Only used if
                `machine_type` is set.
            accelerator_count (Optional[int]):
                The number of accelerators to attach to the
                `machine_type`. Only used if `machine_type` is set.
            starting_replica_count (Optional[int]):
                The number of machine replicas used at the start of the batch
                operation. If not set, Vertex AI decides the starting number, not
                greater than `max_replica_count`. Only used if `machine_type` is
                set.
            max_replica_count (Optional[int]):
                The maximum number of machine replicas the batch operation may
                be scaled to. Only used if `machine_type` is set.
                Default is 10.
            generate_explanation (bool):
                Optional. Generate explanation along with the batch prediction
                results. This will cause the batch prediction output to include
                explanations based on the `predictions_format`:
                - `bigquery`: output includes a column named `explanation`. The value
                    is a struct that conforms to the [aiplatform.gapic.Explanation] object.
                - `jsonl`: The JSON objects on each line include an additional entry
                    keyed `explanation`. The value of the entry is a JSON object that
                    conforms to the [aiplatform.gapic.Explanation] object.
                - `csv`: Generating explanations for CSV format is not supported.
            explanation_metadata (aiplatform.explain.ExplanationMetadata):
                Optional. Explanation metadata configuration for this BatchPredictionJob.
                Can be specified only if `generate_explanation` is set to `True`.

                This value overrides the value of `Model.explanation_metadata`.
                All fields of `explanation_metadata` are optional in the request. If
                a field of the `explanation_metadata` object is not populated, the
                corresponding field of the `Model.explanation_metadata` object is inherited.
                For more details, see `Ref docs <http://tinyurl.com/1igh60kt>`
            explanation_parameters (aiplatform.explain.ExplanationParameters):
                Optional. Parameters to configure explaining for Model's predictions.
                Can be specified only if `generate_explanation` is set to `True`.

                This value overrides the value of `Model.explanation_parameters`.
                All fields of `explanation_parameters` are optional in the request. If
                a field of the `explanation_parameters` object is not populated, the
                corresponding field of the `Model.explanation_parameters` object is inherited.
                For more details, see `Ref docs <http://tinyurl.com/1an4zake>`
            labels (Dict[str, str]):
                Optional. The labels with user-defined metadata to organize your
                BatchPredictionJobs. Label keys and values can be no longer than
                64 characters (Unicode codepoints), can only contain lowercase
                letters, numeric characters, underscores and dashes.
                International characters are allowed. See https://goo.gl/xmQnxf
                for more information and examples of labels.
            project (str):
                Optional. Project to create this batch prediction job in. If not
                set, the project set in aiplatform.init will be used.
            location (str):
                Optional. Location to create this batch prediction job in. If not
                set, the location set in aiplatform.init will be used.
            credentials (Optional[auth_credentials.Credentials]):
                Custom credentials to use to create this batch prediction
                job. Overrides credentials set in aiplatform.init.
            encryption_spec_key_name (Optional[str]):
                Optional. The Cloud KMS resource identifier of the customer
                managed encryption key used to protect the job. Has the
                form:
                ``projects/my-project/locations/my-region/keyRings/my-kr/cryptoKeys/my-key``.
                The key needs to be in the same region as where the compute
                resource is created.

                If this is set, then all
                resources created by the BatchPredictionJob will
                be encrypted with the provided encryption key.

                Overrides encryption_spec_key_name set in aiplatform.init.
            sync (bool):
                Whether to execute this method synchronously. If False, this method
                will be executed in a concurrent Future and any downstream object
                will be immediately returned and synced when the Future has completed.
            create_request_timeout (float):
                Optional. The timeout for the create request in seconds.
            batch_size (int):
                Optional. The number of records (e.g. instances) sent to a machine
                replica in each batch. Consider the machine type and the size of a
                single record when setting this parameter: a higher value speeds up
                the batch operation's execution, but a value that is too high may
                cause a batch to exceed the replica's memory and fail the whole
                operation.
                The default value is 64.
            model_monitoring_objective_config (aiplatform.model_monitoring.ObjectiveConfig):
                Optional. The objective config for model monitoring. Passing this parameter enables
                monitoring on the model associated with this batch prediction job.
            model_monitoring_alert_config (aiplatform.model_monitoring.EmailAlertConfig):
                Optional. Configures how model monitoring alerts are sent to the user. Right now
                only email alert is supported.
            analysis_instance_schema_uri (str):
                Optional. Only applicable if model_monitoring_objective_config is also passed.
                This parameter specifies the URI of the YAML schema file describing
                the format of a single instance that you want Tensorflow Data
                Validation (TFDV) to analyze. If this field is empty, all the
                feature data types are inferred from predict_instance_schema_uri,
                meaning that TFDV will use the data in the exact format as
                prediction request/response.
                If there are any data type differences between the predict instance
                and the TFDV instance, this field can be used to override the schema.
                For models trained with Vertex AI, this field must be set so that
                all the fields of the predict instance are formatted as strings.
            service_account (str):
                Optional. Specifies the service account that the workload runs as.
                Users submitting jobs must have act-as permission on this run-as account.
            wait_for_completion (bool):
                Whether to wait for the job completion.
        Returns:
            (jobs.BatchPredictionJob):
                Instantiated representation of the created batch prediction job.
        """
        # TODO(b/242108750): remove temporary logic once model monitoring for batch prediction is GA
        if model_monitoring_objective_config:
            from google.cloud.aiplatform.compat.types import (
                batch_prediction_job_v1beta1 as gca_bp_job_compat,
                io_v1beta1 as gca_io_compat,
                explanation_v1beta1 as gca_explanation_v1beta1,
                machine_resources_v1beta1 as gca_machine_resources_compat,
                manual_batch_tuning_parameters_v1beta1 as gca_manual_batch_tuning_parameters_compat,
            )
        else:
            from google.cloud.aiplatform.compat.types import (
                batch_prediction_job as gca_bp_job_compat,
                io as gca_io_compat,
                explanation as gca_explanation_v1beta1,
                machine_resources as gca_machine_resources_compat,
                manual_batch_tuning_parameters as gca_manual_batch_tuning_parameters_compat,
            )
        if not job_display_name:
            job_display_name = cls._generate_display_name()

        utils.validate_display_name(job_display_name)

        if labels:
            utils.validate_labels(labels)

        if isinstance(model_name, str):
            try:
                model_name = utils.full_resource_name(
                    resource_name=model_name,
                    resource_noun="models",
                    parse_resource_name_method=aiplatform.Model._parse_resource_name,
                    format_resource_name_method=aiplatform.Model._format_resource_name,
                    project=project,
                    location=location,
                    resource_id_validator=super()._revisioned_resource_id_validator,
                )
            except ValueError:
                # Do not raise exception if model_name is a valid PublisherModel name
                if not _publisher_models._PublisherModel._parse_resource_name(
                    model_name
                ):
                    raise

        # Raise error if both or neither source URIs are provided
        if bool(gcs_source) == bool(bigquery_source):
            raise ValueError(
                "Please provide either a gcs_source or bigquery_source, "
                "but not both."
            )

        # Raise error if both or neither destination prefixes are provided
        if bool(gcs_destination_prefix) == bool(bigquery_destination_prefix):
            raise ValueError(
                "Please provide either a gcs_destination_prefix or "
                "bigquery_destination_prefix, but not both."
            )

        # Raise error if unsupported instance format is provided
        if instances_format not in constants.BATCH_PREDICTION_INPUT_STORAGE_FORMATS:
            raise ValueError(
                f"{instances_format} is not an accepted instances format "
                f"type. Please choose from: {constants.BATCH_PREDICTION_INPUT_STORAGE_FORMATS}"
            )

        # Raise error if unsupported prediction format is provided
        if predictions_format not in constants.BATCH_PREDICTION_OUTPUT_STORAGE_FORMATS:
            raise ValueError(
                f"{predictions_format} is not an accepted prediction format "
                f"type. Please choose from: {constants.BATCH_PREDICTION_OUTPUT_STORAGE_FORMATS}"
            )

gapic_batch_prediction_job = gca_bp_job_compat.BatchPredictionJob()
|
|
|
|
# Required Fields
|
|
gapic_batch_prediction_job.display_name = job_display_name
|
|
|
|
input_config = gca_bp_job_compat.BatchPredictionJob.InputConfig()
|
|
output_config = gca_bp_job_compat.BatchPredictionJob.OutputConfig()
|
|
|
|
if bigquery_source:
|
|
input_config.instances_format = "bigquery"
|
|
input_config.bigquery_source = gca_io_compat.BigQuerySource()
|
|
input_config.bigquery_source.input_uri = bigquery_source
|
|
else:
|
|
input_config.instances_format = instances_format
|
|
input_config.gcs_source = gca_io_compat.GcsSource(
|
|
uris=gcs_source if isinstance(gcs_source, list) else [gcs_source]
|
|
)
|
|
|
|
if bigquery_destination_prefix:
|
|
output_config.predictions_format = "bigquery"
|
|
output_config.bigquery_destination = gca_io_compat.BigQueryDestination()
|
|
            bq_dest_prefix = bigquery_destination_prefix
            if not bq_dest_prefix.startswith("bq://"):
                bq_dest_prefix = f"bq://{bq_dest_prefix}"

            output_config.bigquery_destination.output_uri = bq_dest_prefix
        else:
            output_config.predictions_format = predictions_format
            output_config.gcs_destination = gca_io_compat.GcsDestination(
                output_uri_prefix=gcs_destination_prefix
            )

        gapic_batch_prediction_job.input_config = input_config
        gapic_batch_prediction_job.output_config = output_config

        # Optional Fields
        gapic_batch_prediction_job.encryption_spec = (
            initializer.global_config.get_encryption_spec(
                encryption_spec_key_name=encryption_spec_key_name
            )
        )

        if model_parameters:
            gapic_batch_prediction_job.model_parameters = model_parameters

        # Custom Compute
        if machine_type:

            machine_spec = gca_machine_resources_compat.MachineSpec()
            machine_spec.machine_type = machine_type
            machine_spec.accelerator_type = accelerator_type
            machine_spec.accelerator_count = accelerator_count

            dedicated_resources = gca_machine_resources_compat.BatchDedicatedResources()

            dedicated_resources.machine_spec = machine_spec
            dedicated_resources.starting_replica_count = starting_replica_count
            dedicated_resources.max_replica_count = max_replica_count

            gapic_batch_prediction_job.dedicated_resources = dedicated_resources

            manual_batch_tuning_parameters = (
                gca_manual_batch_tuning_parameters_compat.ManualBatchTuningParameters()
            )
            manual_batch_tuning_parameters.batch_size = batch_size

            gapic_batch_prediction_job.manual_batch_tuning_parameters = (
                manual_batch_tuning_parameters
            )

        # User Labels
        gapic_batch_prediction_job.labels = labels

        # Explanations
        if generate_explanation:
            gapic_batch_prediction_job.generate_explanation = generate_explanation

        if explanation_metadata or explanation_parameters:
            explanation_spec = gca_explanation_compat.ExplanationSpec(
                metadata=explanation_metadata, parameters=explanation_parameters
            )
            # TODO(b/242108750): remove temporary logic once model monitoring for batch prediction is GA
            if model_monitoring_objective_config:
                explanation_spec = gca_explanation_v1beta1.ExplanationSpec.deserialize(
                    gca_explanation_compat.ExplanationSpec.serialize(explanation_spec)
                )
            gapic_batch_prediction_job.explanation_spec = explanation_spec

        service_account = service_account or initializer.global_config.service_account
        if service_account:
            gapic_batch_prediction_job.service_account = service_account

        empty_batch_prediction_job = cls._empty_constructor(
            project=project,
            location=location,
            credentials=credentials,
        )
        if model_monitoring_objective_config:
            empty_batch_prediction_job.api_client = (
                empty_batch_prediction_job.api_client.select_version("v1beta1")
            )

        # TODO(b/242108750): remove temporary logic once model monitoring for batch prediction is GA
        if model_monitoring_objective_config:
            model_monitoring_objective_config._config_for_bp = True
            if model_monitoring_alert_config is not None:
                model_monitoring_alert_config._config_for_bp = True
            gapic_mm_config = gca_model_monitoring_v1beta1.ModelMonitoringConfig(
                objective_configs=[model_monitoring_objective_config.as_proto()],
                alert_config=model_monitoring_alert_config.as_proto()
                if model_monitoring_alert_config is not None
                else None,
                analysis_instance_schema_uri=analysis_instance_schema_uri
                if analysis_instance_schema_uri is not None
                else None,
            )
            gapic_batch_prediction_job.model_monitoring_config = gapic_mm_config

        # TODO(b/242108750): remove temporary logic once model monitoring for batch prediction is GA
        return cls._submit_and_optionally_wait_with_sync_support(
            empty_batch_prediction_job=empty_batch_prediction_job,
            model_or_model_name=model_name,
            gca_batch_prediction_job=gapic_batch_prediction_job,
            generate_explanation=generate_explanation,
            sync=sync,
            create_request_timeout=create_request_timeout,
            wait_for_completion=wait_for_completion,
        )

    @classmethod
    @base.optional_sync(return_input_arg="empty_batch_prediction_job")
    def _submit_and_optionally_wait_with_sync_support(
        cls,
        empty_batch_prediction_job: "BatchPredictionJob",
        model_or_model_name: Union[str, "aiplatform.Model"],
        gca_batch_prediction_job: gca_bp_job_compat.BatchPredictionJob,
        generate_explanation: bool,
        sync: bool = True,
        create_request_timeout: Optional[float] = None,
        wait_for_completion: bool = True,
    ) -> "BatchPredictionJob":
        """Create a batch prediction job.

        Args:
            empty_batch_prediction_job (BatchPredictionJob):
                Required. BatchPredictionJob without _gca_resource populated.
            model_or_model_name (Union[str, aiplatform.Model]):
                Required. A fully-qualified model resource name or
                an instance of aiplatform.Model. If a resource name, it may
                optionally contain a version ID or alias in
                {model_name}@{version} form.
            gca_batch_prediction_job (gca_bp_job.BatchPredictionJob):
                Required. A batch prediction job proto for creating a batch prediction job on Vertex AI.
            generate_explanation (bool):
                Required. Generate explanation along with the batch prediction
                results.
            create_request_timeout (float):
                Optional. The timeout for the create request in seconds.
            wait_for_completion (bool):
                Whether to wait for the job completion.
        Returns:
            (jobs.BatchPredictionJob):
                Instantiated representation of the created batch prediction job.

        Raises:
            ValueError:
                If no or multiple source or destinations are provided. Also, if
                provided instances_format or predictions_format are not supported
                by Vertex AI.
        """

        parent = initializer.global_config.common_location_path(
            project=empty_batch_prediction_job.project,
            location=empty_batch_prediction_job.location,
        )

        model_resource_name = (
            model_or_model_name
            if isinstance(model_or_model_name, str)
            else model_or_model_name.versioned_resource_name
        )

        gca_batch_prediction_job.model = model_resource_name

        api_client = empty_batch_prediction_job.api_client

        _LOGGER.log_create_with_lro(cls)

        gca_batch_prediction_job = api_client.create_batch_prediction_job(
            parent=parent,
            batch_prediction_job=gca_batch_prediction_job,
            timeout=create_request_timeout,
        )

        empty_batch_prediction_job._gca_resource = gca_batch_prediction_job

        batch_prediction_job = empty_batch_prediction_job

        _LOGGER.log_create_complete(cls, batch_prediction_job._gca_resource, "bpj")

        _LOGGER.info(
            "View Batch Prediction Job:\n%s" % batch_prediction_job._dashboard_uri()
        )
        if wait_for_completion:
            batch_prediction_job._block_until_complete()

        return batch_prediction_job

    def iter_outputs(
        self, bq_max_results: Optional[int] = 100
    ) -> Union[
        Iterable["storage.Blob"], Iterable["bigquery.table.RowIterator"]  # noqa: F821
    ]:
        """Returns an Iterable object to traverse the output files, either a
        list of GCS Blobs or a BigQuery RowIterator depending on the output
        config set when the BatchPredictionJob was created.

        Args:
            bq_max_results: Optional[int] = 100
                Limit on rows to retrieve from prediction table in BigQuery dataset.
                Only used when retrieving predictions from a bigquery_destination_prefix.
                Default is 100.

        Returns:
            Union[Iterable[storage.Blob], Iterable[bigquery.table.RowIterator]]:
                Either a list of GCS Blob objects within the prediction output
                directory or an iterable BigQuery RowIterator with predictions.

        Raises:
            RuntimeError:
                If BatchPredictionJob is in a JobState other than SUCCEEDED,
                since outputs cannot be retrieved until the Job has finished.
            NotImplementedError:
                If BatchPredictionJob succeeded and output_info does not have a
                GCS or BQ output provided.
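
        Example usage (a minimal sketch; assumes `bp_job` is a succeeded
        BatchPredictionJob):

        ```
        # GCS destination: iterates storage.Blob objects
        for blob in bp_job.iter_outputs():
            print(blob.name)

        # BigQuery destination: iterates prediction rows
        for row in bp_job.iter_outputs(bq_max_results=10):
            print(row)
        ```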
        """
        # pylint: disable=g-import-not-at-top
        from google.cloud import bigquery
        from google.cloud import storage

        self._assert_gca_resource_is_available()

        if self.state != gca_job_state.JobState.JOB_STATE_SUCCEEDED:
            raise RuntimeError(
                "Cannot read outputs until BatchPredictionJob has succeeded, "
                f"current state: {self._gca_resource.state}"
            )

        output_info = self._gca_resource.output_info

        # GCS Destination, return Blobs
        if output_info.gcs_output_directory:

            # Build a Storage Client using the same credentials as JobServiceClient
            storage_client = storage.Client(
                project=self.project,
                credentials=self.api_client._transport._credentials,
            )

            gcs_bucket, gcs_prefix = utils.extract_bucket_and_prefix_from_gcs_path(
                output_info.gcs_output_directory
            )

            blobs = storage_client.list_blobs(gcs_bucket, prefix=gcs_prefix)

            return blobs

        # BigQuery Destination, return RowIterator
        elif output_info.bigquery_output_dataset:

            # Format of `bigquery_output_dataset` from service is `bq://projectId.bqDatasetId`
            bq_dataset = output_info.bigquery_output_dataset
            bq_table = output_info.bigquery_output_table

            if not bq_table:
                raise RuntimeError(
                    "A BigQuery table with predictions was not found, this "
                    f"might be due to errors. Visit {self._dashboard_uri()} for details."
                )

            if bq_dataset.startswith("bq://"):
                bq_dataset = bq_dataset[5:]

            # Build a BigQuery Client using the same credentials as JobServiceClient
            bq_client = bigquery.Client(
                project=self.project,
                credentials=self.api_client._transport._credentials,
            )

            row_iterator = bq_client.list_rows(
                table=f"{bq_dataset}.{bq_table}", max_results=bq_max_results
            )

            return row_iterator

        # Unknown Destination type
        else:
            raise NotImplementedError(
                "Unsupported batch prediction output location, here are details "
                f"on your prediction output:\n{output_info}"
            )

    def wait_for_resource_creation(self) -> None:
        """Waits until resource has been created."""
        self._wait_for_resource_creation()


class _RunnableJob(_Job):
    """ABC to interface job as a runnable training class."""

    def __init__(
        self,
        project: Optional[str] = None,
        location: Optional[str] = None,
        credentials: Optional[auth_credentials.Credentials] = None,
    ):
        """Initializes job with project, location, and api_client.

        Args:
            project(str): Project of the resource noun.
            location(str): The location of the resource noun.
            credentials(google.auth.credentials.Credentials): Optional. Custom
                credentials to use when interacting with the resource noun.
        """

        base.VertexAiResourceNounWithFutureManager.__init__(
            self, project=project, location=location, credentials=credentials
        )

        self._parent = aiplatform.initializer.global_config.common_location_path(
            project=project, location=location
        )

        self._logged_web_access_uris = set()

    @classmethod
    def _empty_constructor(
        cls,
        project: Optional[str] = None,
        location: Optional[str] = None,
        credentials: Optional[auth_credentials.Credentials] = None,
        resource_name: Optional[str] = None,
    ) -> "_RunnableJob":
        """Initializes with all attributes set to None.

        The attributes should be populated after a future is complete. This allows
        scheduling of additional API calls before the resource is created.

        Args:
            project (str): Optional. Project of the resource noun.
            location (str): Optional. The location of the resource noun.
            credentials(google.auth.credentials.Credentials):
                Optional. Custom credentials to use when interacting with the
                resource noun.
            resource_name(str): Optional. A fully-qualified resource name or ID.
        Returns:
            An instance of this class with attributes set to None.
        """
        self = super()._empty_constructor(
            project=project,
            location=location,
            credentials=credentials,
            resource_name=resource_name,
        )

        self._logged_web_access_uris = set()

        if isinstance(self, CustomJob):
            self._enable_autolog = False

        return self

    @property
    def web_access_uris(self) -> Dict[str, Union[str, Dict[str, str]]]:
        """Fetch the runnable job again and return the latest web access uris.

        Returns:
            (Dict[str, Union[str, Dict[str, str]]]):
                Web access uris of the runnable job.
        """

        # Fetch the Job again for most up-to-date web access uris
        self._sync_gca_resource()
        return self._get_web_access_uris()

    @abc.abstractmethod
    def _get_web_access_uris(self):
        """Helper method to get the web access uris of the runnable job."""
        pass

    @abc.abstractmethod
    def _log_web_access_uris(self):
        """Helper method to log the web access uris of the runnable job."""
        pass

    def _block_until_complete(self):
        """Helper method to block and check on runnable job until complete.

        Raises:
            RuntimeError: If job failed or cancelled.
        """

        log_wait = _LOG_WAIT_TIME

        previous_time = time.time()
        while self.state not in _JOB_COMPLETE_STATES:
            current_time = time.time()
            if current_time - previous_time >= log_wait:
                self._log_job_state()
                log_wait = min(log_wait * _WAIT_TIME_MULTIPLIER, _MAX_WAIT_TIME)
                previous_time = current_time
            self._log_web_access_uris()
            time.sleep(_JOB_WAIT_TIME)

        self._log_job_state()

        if isinstance(self, CustomJob):
            # End the experiment run associated with the custom job, if exists.
            experiment_runs = []
            if self._gca_resource.job_spec.experiment_run:
                experiment_runs = [self._gca_resource.job_spec.experiment_run]
            elif self._gca_resource.job_spec.tensorboard:
                tensorboard_id = self._gca_resource.job_spec.tensorboard.split("/")[-1]
                try:
                    tb_runs = aiplatform.TensorboardRun.list(
                        tensorboard_experiment_name=self.name,
                        tensorboard_id=tensorboard_id,
                    )
                    experiment_runs = [
                        f"{self.name}-{tb_run.name.split('/')[-1]}"
                        for tb_run in tb_runs
                    ]
                except (ValueError, api_exceptions.GoogleAPIError) as e:
                    _LOGGER.warning(
                        f"Failed to list experiment runs for tensorboard "
                        f"{tensorboard_id} due to: {e}"
                    )
            for experiment_run in experiment_runs:
                try:
                    # sync resource before end run
                    experiment_run_context = aiplatform.Context(experiment_run)
                    experiment_run_context.update(
                        metadata={
                            metadata_constants._STATE_KEY: (
                                gca_execution_compat.Execution.State.COMPLETE.name
                            )
                        }
                    )
                except (ValueError, api_exceptions.GoogleAPIError) as e:
                    _LOGGER.warning(
                        f"Failed to end experiment run {experiment_run} due to: {e}"
                    )

        # Error is only populated when the job state is
        # JOB_STATE_FAILED or JOB_STATE_CANCELLED.
        if self._gca_resource.state in _JOB_ERROR_STATES:
            raise RuntimeError("Job failed with:\n%s" % self._gca_resource.error)
        else:
            _LOGGER.log_action_completed_against_resource("run", "completed", self)

    @abc.abstractmethod
    def run(self) -> None:
        pass

    @classmethod
    def get(
        cls,
        resource_name: str,
        project: Optional[str] = None,
        location: Optional[str] = None,
        credentials: Optional[auth_credentials.Credentials] = None,
    ) -> "_RunnableJob":
        """Get a Vertex AI Job for the given resource_name.

        Args:
            resource_name (str):
                Required. A fully-qualified resource name or ID.
            project (str):
                Optional. Project to retrieve the job from. If not set, the
                project set in aiplatform.init will be used.
            location (str):
                Optional. Location to retrieve the job from. If not set, the
                location set in aiplatform.init will be used.
            credentials (auth_credentials.Credentials):
                Optional. Custom credentials to use to retrieve this job.
                Overrides credentials set in aiplatform.init.

        Returns:
            A Vertex AI Job.
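
        Example usage (a minimal sketch on a concrete subclass; the resource
        name is a placeholder):

        ```
        job = aiplatform.CustomJob.get(
            resource_name="projects/123/locations/us-central1/customJobs/456",
        )
        ```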
        """
        self = cls._empty_constructor(
            project=project,
            location=location,
            credentials=credentials,
            resource_name=resource_name,
        )

        self._gca_resource = self._get_gca_resource(resource_name=resource_name)

        return self

    def wait_for_resource_creation(self) -> None:
        """Waits until resource has been created."""
        self._wait_for_resource_creation()


class DataLabelingJob(_Job):
    """Vertex AI Data Labeling Job."""

    _resource_noun = "dataLabelingJobs"
    _getter_method = "get_data_labeling_job"
    _list_method = "list_data_labeling_jobs"
    _cancel_method = "cancel_data_labeling_job"
    _delete_method = "delete_data_labeling_job"
    _job_type = "labeling-tasks"
    _parse_resource_name_method = "parse_data_labeling_job_path"
    _format_resource_name_method = "data_labeling_job_path"


class CustomJob(_RunnableJob, base.PreviewMixin):
    """Vertex AI Custom Job."""

    _resource_noun = "customJobs"
    _getter_method = "get_custom_job"
    _list_method = "list_custom_jobs"
    _cancel_method = "cancel_custom_job"
    _delete_method = "delete_custom_job"
    _parse_resource_name_method = "parse_custom_job_path"
    _format_resource_name_method = "custom_job_path"
    _job_type = "training"
    _preview_class = "google.cloud.aiplatform.aiplatform.preview.jobs.CustomJob"

    def __init__(
        self,
        # TODO(b/223262536): Make display_name parameter fully optional in next major release
        display_name: str,
        worker_pool_specs: Union[List[Dict], List[custom_job_v1.WorkerPoolSpec]],
        base_output_dir: Optional[str] = None,
        project: Optional[str] = None,
        location: Optional[str] = None,
        credentials: Optional[auth_credentials.Credentials] = None,
        labels: Optional[Dict[str, str]] = None,
        encryption_spec_key_name: Optional[str] = None,
        staging_bucket: Optional[str] = None,
        persistent_resource_id: Optional[str] = None,
    ):
"""Constructs a Custom Job with Worker Pool Specs.
|
|
|
|
```
|
|
Example usage:
|
|
worker_pool_specs = [
|
|
{
|
|
"machine_spec": {
|
|
"machine_type": "n1-standard-4",
|
|
"accelerator_type": "NVIDIA_TESLA_K80",
|
|
"accelerator_count": 1,
|
|
},
|
|
"replica_count": 1,
|
|
"container_spec": {
|
|
"image_uri": container_image_uri,
|
|
"command": [],
|
|
"args": [],
|
|
},
|
|
}
|
|
]
|
|
|
|
my_job = aiplatform.CustomJob(
|
|
display_name='my_job',
|
|
worker_pool_specs=worker_pool_specs,
|
|
labels={'my_key': 'my_value'},
|
|
)
|
|
|
|
my_job.run()
|
|
```
|
|
|
|
|
|
For more information on configuring worker pool specs please visit:
|
|
https://cloud.google.com/ai-platform-unified/docs/training/create-custom-job
|
|
|
|
|
|
Args:
|
|
display_name (str):
|
|
Required. The user-defined name of the HyperparameterTuningJob.
|
|
The name can be up to 128 characters long and can be consist
|
|
of any UTF-8 characters.
|
|
worker_pool_specs (Union[List[Dict], List[aiplatform.gapic.WorkerPoolSpec]]):
|
|
Required. The spec of the worker pools including machine type and Docker image.
|
|
Can provided as a list of dictionaries or list of WorkerPoolSpec proto messages.
|
|
base_output_dir (str):
|
|
Optional. GCS output directory of job. If not provided a
|
|
timestamped directory in the staging directory will be used.
|
|
project (str):
|
|
Optional.Project to run the custom job in. Overrides project set in aiplatform.init.
|
|
location (str):
|
|
Optional.Location to run the custom job in. Overrides location set in aiplatform.init.
|
|
credentials (auth_credentials.Credentials):
|
|
Optional.Custom credentials to use to run call custom job service. Overrides
|
|
credentials set in aiplatform.init.
|
|
labels (Dict[str, str]):
|
|
Optional. The labels with user-defined metadata to
|
|
organize CustomJobs.
|
|
Label keys and values can be no longer than 64
|
|
characters (Unicode codepoints), can only
|
|
contain lowercase letters, numeric characters,
|
|
underscores and dashes. International characters
|
|
are allowed.
|
|
See https://goo.gl/xmQnxf for more information
|
|
and examples of labels.
|
|
encryption_spec_key_name (str):
|
|
Optional.Customer-managed encryption key name for a
|
|
CustomJob. If this is set, then all resources
|
|
created by the CustomJob will be encrypted with
|
|
the provided encryption key.
|
|
staging_bucket (str):
|
|
Optional. Bucket for produced custom job artifacts. Overrides
|
|
staging_bucket set in aiplatform.init.
|
|
persistent_resource_id (str):
|
|
Optional. The ID of the PersistentResource in the same Project
|
|
and Location. If this is specified, the job will be run on
|
|
existing machines held by the PersistentResource instead of
|
|
on-demand short-live machines. The network and CMEK configs on
|
|
the job should be consistent with those on the PersistentResource,
|
|
otherwise, the job will be rejected.
|
|
|
|
Raises:
|
|
RuntimeError: If staging bucket was not set using aiplatform.init
|
|
and a staging bucket was not passed in.
|
|
"""

        super().__init__(project=project, location=location, credentials=credentials)

        staging_bucket = staging_bucket or initializer.global_config.staging_bucket

        if not staging_bucket:
            raise RuntimeError(
                "staging_bucket should be passed to CustomJob constructor or "
                "should be set using aiplatform.init(staging_bucket='gs://my-bucket')"
            )

        if labels:
            utils.validate_labels(labels)

        # default directory if not given
        base_output_dir = base_output_dir or utils._timestamped_gcs_dir(
            staging_bucket, "aiplatform-custom-job"
        )

        if not display_name:
            display_name = self.__class__._generate_display_name()

        self._gca_resource = gca_custom_job_compat.CustomJob(
            display_name=display_name,
            job_spec=gca_custom_job_compat.CustomJobSpec(
                worker_pool_specs=worker_pool_specs,
                base_output_directory=gca_io_compat.GcsDestination(
                    output_uri_prefix=base_output_dir
                ),
                persistent_resource_id=persistent_resource_id,
            ),
            labels=labels,
            encryption_spec=initializer.global_config.get_encryption_spec(
                encryption_spec_key_name=encryption_spec_key_name
            ),
        )

        self._enable_autolog = False

    @property
    def network(self) -> Optional[str]:
        """The full name of the Google Compute Engine
        [network](https://cloud.google.com/vpc/docs/vpc#networks) to which this
        CustomJob should be peered.

        Takes the format `projects/{project}/global/networks/{network}`. Where
        {project} is a project number, as in `12345`, and {network} is a network name.

        Private services access must already be configured for the network. If left
        unspecified, the CustomJob is not peered with any network.
        """
        self._assert_gca_resource_is_available()
        return self._gca_resource.job_spec.network

    def _get_web_access_uris(self) -> Dict[str, str]:
        """Helper method to get the web access uris of the custom job.

        Returns:
            (Dict[str, str]):
                Web access uris of the custom job.
        """
        return dict(self._gca_resource.web_access_uris)

    def _log_web_access_uris(self):
        """Helper method to log the web access uris of the custom job."""

        for worker, uri in self._get_web_access_uris().items():
            if uri not in self._logged_web_access_uris:
                _LOGGER.info(
                    "%s %s access the interactive shell terminals for the custom job:\n%s:\n%s"
                    % (
                        self.__class__.__name__,
                        self._gca_resource.name,
                        worker,
                        uri,
                    ),
                )
                self._logged_web_access_uris.add(uri)

    @classmethod
    def from_local_script(
        cls,
        # TODO(b/223262536): Make display_name parameter fully optional in next major release
        display_name: str,
        script_path: str,
        container_uri: str,
        enable_autolog: bool = False,
        args: Optional[Sequence[str]] = None,
        requirements: Optional[Sequence[str]] = None,
        environment_variables: Optional[Dict[str, str]] = None,
        replica_count: int = 1,
        machine_type: str = "n1-standard-4",
        accelerator_type: str = "ACCELERATOR_TYPE_UNSPECIFIED",
        accelerator_count: int = 0,
        boot_disk_type: str = "pd-ssd",
        boot_disk_size_gb: int = 100,
        reduction_server_replica_count: int = 0,
        reduction_server_machine_type: Optional[str] = None,
        reduction_server_container_uri: Optional[str] = None,
        base_output_dir: Optional[str] = None,
        project: Optional[str] = None,
        location: Optional[str] = None,
        credentials: Optional[auth_credentials.Credentials] = None,
        labels: Optional[Dict[str, str]] = None,
        encryption_spec_key_name: Optional[str] = None,
        staging_bucket: Optional[str] = None,
        persistent_resource_id: Optional[str] = None,
        tpu_topology: Optional[str] = None,
    ) -> "CustomJob":
"""Configures a custom job from a local script.
|
|
|
|
Example usage:
|
|
```
|
|
job = aiplatform.CustomJob.from_local_script(
|
|
display_name="my-custom-job",
|
|
script_path="training_script.py",
|
|
container_uri="gcr.io/cloud-aiplatform/training/tf-cpu.2-2:latest",
|
|
requirements=["gcsfs==0.7.1"],
|
|
replica_count=1,
|
|
args=['--dataset', 'gs://my-bucket/my-dataset',
|
|
'--model_output_uri', 'gs://my-bucket/model']
|
|
labels={'my_key': 'my_value'},
|
|
)
|
|
|
|
job.run()
|
|
```
|
|
|
|
Args:
|
|
display_name (str):
|
|
Required. The user-defined name of this CustomJob.
|
|
script_path (str):
|
|
Required. Local path to training script.
|
|
container_uri (str):
|
|
Required. Uri of the training container image to use for custom job.
|
|
Support images in Artifact Registry, Container Registry, or Docker Hub.
|
|
Vertex AI provides a wide range of executor images with pre-installed
|
|
packages to meet users' various use cases. See the list of `pre-built containers
|
|
for training <https://cloud.google.com/vertex-ai/docs/training/pre-built-containers>`.
|
|
If not using image from this list, please make sure python3 and pip3 are installed in your container.
|
|
enable_autolog (bool):
|
|
Optional. If True, the Vertex Experiments autologging feature will be
|
|
enabled in the CustomJob. Note that this will wrap your training script
|
|
with some autologging-related code.
|
|
args (Optional[Sequence[str]]):
|
|
Optional. Command line arguments to be passed to the Python task.
|
|
requirements (Sequence[str]):
|
|
Optional. List of python packages dependencies of script.
|
|
environment_variables (Dict[str, str]):
|
|
Optional. Environment variables to be passed to the container.
|
|
Should be a dictionary where keys are environment variable names
|
|
and values are environment variable values for those names.
|
|
At most 10 environment variables can be specified.
|
|
The Name of the environment variable must be unique.
|
|
|
|
environment_variables = {
|
|
'MY_KEY': 'MY_VALUE'
|
|
}
|
|
replica_count (int):
|
|
Optional. The number of worker replicas. If replica count = 1 then one chief
|
|
replica will be provisioned. If replica_count > 1 the remainder will be
|
|
provisioned as a worker replica pool.
|
|
machine_type (str):
|
|
Optional. The type of machine to use for training.
|
|
accelerator_type (str):
|
|
Optional. Hardware accelerator type. One of ACCELERATOR_TYPE_UNSPECIFIED,
|
|
NVIDIA_TESLA_K80, NVIDIA_TESLA_P100, NVIDIA_TESLA_V100, NVIDIA_TESLA_P4,
|
|
NVIDIA_TESLA_T4
|
|
accelerator_count (int):
|
|
Optional. The number of accelerators to attach to a worker replica.
|
|
boot_disk_type (str):
|
|
Optional. Type of the boot disk, default is `pd-ssd`.
|
|
Valid values: `pd-ssd` (Persistent Disk Solid State Drive) or
|
|
`pd-standard` (Persistent Disk Hard Disk Drive).
|
|
boot_disk_size_gb (int):
|
|
Optional. Size in GB of the boot disk, default is 100GB.
|
|
boot disk size must be within the range of [100, 64000].
|
|
reduction_server_replica_count (int):
|
|
The number of reduction server replicas, default is 0.
|
|
reduction_server_machine_type (str):
|
|
Optional. The type of machine to use for reduction server.
|
|
reduction_server_container_uri (str):
|
|
Optional. The Uri of the reduction server container image.
|
|
See details: https://cloud.google.com/vertex-ai/docs/training/distributed-training#reduce_training_time_with_reduction_server
|
|
base_output_dir (str):
|
|
Optional. GCS output directory of job. If not provided a
|
|
timestamped directory in the staging directory will be used.
|
|
project (str):
|
|
Optional. Project to run the custom job in. Overrides project set in aiplatform.init.
|
|
location (str):
|
|
Optional. Location to run the custom job in. Overrides location set in aiplatform.init.
|
|
credentials (auth_credentials.Credentials):
|
|
Optional. Custom credentials to use to run call custom job service. Overrides
|
|
credentials set in aiplatform.init.
|
|
labels (Dict[str, str]):
|
|
Optional. The labels with user-defined metadata to
|
|
organize CustomJobs.
|
|
Label keys and values can be no longer than 64
|
|
characters (Unicode codepoints), can only
|
|
contain lowercase letters, numeric characters,
|
|
underscores and dashes. International characters
|
|
are allowed.
|
|
See https://goo.gl/xmQnxf for more information
|
|
and examples of labels.
|
|
encryption_spec_key_name (str):
|
|
Optional. Customer-managed encryption key name for a
|
|
CustomJob. If this is set, then all resources
|
|
created by the CustomJob will be encrypted with
|
|
the provided encryption key.
|
|
staging_bucket (str):
|
|
Optional. Bucket for produced custom job artifacts. Overrides
|
|
staging_bucket set in aiplatform.init.
|
|
persistent_resource_id (str):
|
|
Optional. The ID of the PersistentResource in the same Project
|
|
and Location. If this is specified, the job will be run on
|
|
existing machines held by the PersistentResource instead of
|
|
on-demand short-live machines. The network, CMEK, and node pool
|
|
configs on the job should be consistent with those on the
|
|
PersistentResource, otherwise, the job will be rejected.
|
|
tpu_topology (str):
|
|
Optional. Specifies the tpu topology to be used for
|
|
TPU training job. This field is required for TPU v5 versions. For
|
|
details on the TPU topology, refer to
|
|
https://cloud.google.com/tpu/docs/v5e#tpu-v5e-config. The topology
|
|
must be a supported value for the TPU machine type.
|
|
|
|
Raises:
|
|
RuntimeError: If staging bucket was not set using aiplatform.init
|
|
and a staging bucket was not passed in.
|
|
"""

        project = project or initializer.global_config.project
        location = location or initializer.global_config.location
        staging_bucket = staging_bucket or initializer.global_config.staging_bucket

        if not staging_bucket:
            raise RuntimeError(
                "staging_bucket should be passed to CustomJob.from_local_script or "
                "should be set using aiplatform.init(staging_bucket='gs://my-bucket')"
            )

        if labels:
            utils.validate_labels(labels)

        worker_pool_specs = (
            worker_spec_utils._DistributedTrainingSpec.chief_worker_pool(
                replica_count=replica_count,
                machine_type=machine_type,
                accelerator_count=accelerator_count,
                accelerator_type=accelerator_type,
                boot_disk_type=boot_disk_type,
                boot_disk_size_gb=boot_disk_size_gb,
                reduction_server_replica_count=reduction_server_replica_count,
                reduction_server_machine_type=reduction_server_machine_type,
                tpu_topology=tpu_topology,
            ).pool_specs
        )

        # if users enable autolog, automatically install SDK in their container image
        # otherwise users need to manually install SDK
        if enable_autolog:
            experiment_requirements = [constants.AIPLATFORM_AUTOLOG_DEPENDENCY_PATH]
        else:
            experiment_requirements = []

        if requirements:
            requirements.extend(experiment_requirements)
        else:
            requirements = experiment_requirements

        if enable_autolog:
            with tempfile.TemporaryDirectory() as temp_dir:
                autolog_script_path = f"{temp_dir}/trainer_with_autolog.py"
                with open(autolog_script_path, "w") as f:
                    autolog_script = (
                        "# Start a Vertex Experiments autolog session...\n"
                        "from google.cloud "
                        "import aiplatform\n"
                        "aiplatform.autolog()\n\n"
                        "# Training script...\n"
                    )
                    f.write(autolog_script)

                    # Append the user's training script to the autolog preamble.
                    with open(script_path, "r") as trainer_file:
                        f.write(trainer_file.read())

                python_packager = source_utils._TrainingScriptPythonPackager(
                    script_path=autolog_script_path, requirements=requirements
                )

                package_gcs_uri = python_packager.package_and_copy_to_gcs(
                    gcs_staging_dir=staging_bucket,
                    project=project,
                    credentials=credentials,
                )
        else:
            python_packager = source_utils._TrainingScriptPythonPackager(
                script_path=script_path, requirements=requirements
            )

            package_gcs_uri = python_packager.package_and_copy_to_gcs(
                gcs_staging_dir=staging_bucket,
                project=project,
                credentials=credentials,
            )

        for spec_order, spec in enumerate(worker_pool_specs):

            if not spec:
                continue

            if (
                spec_order == worker_spec_utils._SPEC_ORDERS["server_spec"]
                and reduction_server_replica_count > 0
            ):
                spec["container_spec"] = {
                    "image_uri": reduction_server_container_uri,
                }
            # check if the container is pre-built
            elif ("docker.pkg.dev/vertex-ai/" in container_uri) or (
                "gcr.io/cloud-aiplatform/" in container_uri
            ):
                spec["python_package_spec"] = {
                    "executor_image_uri": container_uri,
                    "python_module": python_packager.module_name,
                    "package_uris": [package_gcs_uri],
                }

                if args:
                    spec["python_package_spec"]["args"] = args

                if environment_variables:
                    spec["python_package_spec"]["env"] = [
                        {"name": key, "value": value}
                        for key, value in environment_variables.items()
                    ]
            else:
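                # For a custom (non pre-built) container, install the packaged
                # trainer at runtime; "gs://" is rewritten to "/gcs/" so the
                # package is read through the Cloud Storage FUSE mount on the
                # worker.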
                command = [
                    "sh",
                    "-c",
                    "pip install --upgrade pip && "
                    + f"pip3 install -q --user {package_gcs_uri} && ".replace(
                        "gs://", "/gcs/"
                    )
                    + f"python3 -m {python_packager.module_name}",
                ]

                if args:
                    command[-1] += " " + " ".join(args)

                spec["container_spec"] = {
                    "image_uri": container_uri,
                    "command": command,
                }

                if environment_variables:
                    spec["container_spec"]["env"] = [
                        {"name": key, "value": value}
                        for key, value in environment_variables.items()
                    ]

        job = cls(
            display_name=display_name,
            worker_pool_specs=worker_pool_specs,
            base_output_dir=base_output_dir,
            project=project,
            location=location,
            credentials=credentials,
            labels=labels,
            encryption_spec_key_name=encryption_spec_key_name,
            staging_bucket=staging_bucket,
            persistent_resource_id=persistent_resource_id,
        )

        if enable_autolog:
            job._enable_autolog = True

        return job

    def run(
        self,
        service_account: Optional[str] = None,
        network: Optional[str] = None,
        timeout: Optional[int] = None,
        restart_job_on_worker_restart: bool = False,
        enable_web_access: bool = False,
        experiment: Optional[Union["aiplatform.Experiment", str]] = None,
        experiment_run: Optional[Union["aiplatform.ExperimentRun", str]] = None,
        tensorboard: Optional[str] = None,
        sync: bool = True,
        create_request_timeout: Optional[float] = None,
        disable_retries: bool = False,
        persistent_resource_id: Optional[str] = None,
        scheduling_strategy: Optional[gca_custom_job_compat.Scheduling.Strategy] = None,
        max_wait_duration: Optional[int] = None,
    ) -> None:
"""Run this configured CustomJob.
|
|
|
|
Args:
|
|
service_account (str):
|
|
Optional. Specifies the service account for workload run-as account.
|
|
Users submitting jobs must have act-as permission on this run-as account.
|
|
network (str):
|
|
Optional. The full name of the Compute Engine network to which the job
|
|
should be peered. For example, projects/12345/global/networks/myVPC.
|
|
Private services access must already be configured for the network.
|
|
If left unspecified, the network set in aiplatform.init will be used.
|
|
Otherwise, the job is not peered with any network.
|
|
timeout (int):
|
|
The maximum job running time in seconds. The default is 7 days.
|
|
restart_job_on_worker_restart (bool):
|
|
Restarts the entire CustomJob if a worker
|
|
gets restarted. This feature can be used by
|
|
distributed training jobs that are not resilient
|
|
to workers leaving and joining a job.
|
|
enable_web_access (bool):
|
|
Whether you want Vertex AI to enable interactive shell access
|
|
to training containers.
|
|
https://cloud.google.com/vertex-ai/docs/training/monitor-debug-interactive-shell
|
|
experiment (Union[aiplatform.Experiment, str]):
|
|
Optional. The instance or name of an Experiment resource to which
|
|
this CustomJob will upload training parameters and metrics.
|
|
|
|
`service_account` is required with provided `experiment`.
|
|
For more information on configuring your service account please visit:
|
|
https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-training
|
|
experiment_run (Union[aiplatform.ExperimentRun, str]):
|
|
Optional. The instance or name of an ExperimentRun resource to which
|
|
this CustomJob will upload training parameters and metrics.
|
|
This arg can only be set when `experiment` is set. If 'experiment'
|
|
is set but 'experiment_run` is not, an ExperimentRun resource
|
|
will still be auto-generated.
|
|
tensorboard (str):
|
|
Optional. The name of a Vertex AI
|
|
[Tensorboard][google.cloud.aiplatform.v1beta1.Tensorboard]
|
|
resource to which this CustomJob will upload Tensorboard
|
|
logs. Format:
|
|
``projects/{project}/locations/{location}/tensorboards/{tensorboard}``
|
|
|
|
The training script should write Tensorboard to following Vertex AI environment
|
|
variable:
|
|
|
|
AIP_TENSORBOARD_LOG_DIR
|
|
|
|
`service_account` is required with provided `tensorboard`.
|
|
For more information on configuring your service account please visit:
|
|
https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-training
|
|
sync (bool):
|
|
Whether to execute this method synchronously. If False, this method
|
|
will unblock and it will be executed in a concurrent Future.
|
|
create_request_timeout (float):
|
|
Optional. The timeout for the create request in seconds.
|
|
disable_retries (bool):
|
|
Indicates if the job should retry for internal errors after the
|
|
job starts running. If True, overrides
|
|
`restart_job_on_worker_restart` to False.
|
|
persistent_resource_id (str):
|
|
Optional. The ID of the PersistentResource in the same Project
|
|
and Location. If this is specified, the job will be run on
|
|
existing machines held by the PersistentResource instead of
|
|
on-demand short-live machines. The network, CMEK, and node pool
|
|
configs on the job should be consistent with those on the
|
|
PersistentResource, otherwise, the job will be rejected.
|
|
scheduling_strategy (gca_custom_job_compat.Scheduling.Strategy):
|
|
Optional. Indicates the job scheduling strategy.
|
|
max_wait_duration (int):
|
|
This is the maximum duration that a job will wait for the
|
|
requested resources to be provisioned in seconds. If set to 0,
|
|
the job will wait indefinitely. The default is 1 day.
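
        Example usage (a minimal sketch; the service account and experiment
        names are placeholders):

        ```
        my_job.run(
            service_account="my-sa@my-project.iam.gserviceaccount.com",
            experiment="my-experiment",
            experiment_run="my-run-1",
        )
        ```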
        """
        network = network or initializer.global_config.network
        service_account = service_account or initializer.global_config.service_account

        self._run(
            service_account=service_account,
            network=network,
            timeout=timeout,
            restart_job_on_worker_restart=restart_job_on_worker_restart,
            enable_web_access=enable_web_access,
            experiment=experiment,
            experiment_run=experiment_run,
            tensorboard=tensorboard,
            sync=sync,
            create_request_timeout=create_request_timeout,
            disable_retries=disable_retries,
            persistent_resource_id=persistent_resource_id,
            scheduling_strategy=scheduling_strategy,
            max_wait_duration=max_wait_duration,
        )

    @base.optional_sync()
    def _run(
        self,
        service_account: Optional[str] = None,
        network: Optional[str] = None,
        timeout: Optional[int] = None,
        restart_job_on_worker_restart: bool = False,
        enable_web_access: bool = False,
        experiment: Optional[Union["aiplatform.Experiment", str]] = None,
        experiment_run: Optional[Union["aiplatform.ExperimentRun", str]] = None,
        tensorboard: Optional[str] = None,
        sync: bool = True,
        create_request_timeout: Optional[float] = None,
        disable_retries: bool = False,
        persistent_resource_id: Optional[str] = None,
        scheduling_strategy: Optional[gca_custom_job_compat.Scheduling.Strategy] = None,
        max_wait_duration: Optional[int] = None,
    ) -> None:
        """Helper method to ensure network synchronization and to run the
        configured CustomJob.

        The arguments are identical to those of ``run`` above; see that
        docstring for details. ``sync`` controls whether this helper executes
        synchronously or in a concurrent Future via the ``optional_sync``
        decorator.
        """
        self.submit(
            service_account=service_account,
            network=network,
            timeout=timeout,
            restart_job_on_worker_restart=restart_job_on_worker_restart,
            enable_web_access=enable_web_access,
            experiment=experiment,
            experiment_run=experiment_run,
            tensorboard=tensorboard,
            create_request_timeout=create_request_timeout,
            disable_retries=disable_retries,
            persistent_resource_id=persistent_resource_id,
            scheduling_strategy=scheduling_strategy,
            max_wait_duration=max_wait_duration,
        )

        self._block_until_complete()

    def submit(
        self,
        *,
        service_account: Optional[str] = None,
        network: Optional[str] = None,
        timeout: Optional[int] = None,
        restart_job_on_worker_restart: bool = False,
        enable_web_access: bool = False,
        experiment: Optional[Union["aiplatform.Experiment", str]] = None,
        experiment_run: Optional[Union["aiplatform.ExperimentRun", str]] = None,
        tensorboard: Optional[str] = None,
        create_request_timeout: Optional[float] = None,
        disable_retries: bool = False,
        persistent_resource_id: Optional[str] = None,
        scheduling_strategy: Optional[gca_custom_job_compat.Scheduling.Strategy] = None,
        max_wait_duration: Optional[int] = None,
    ) -> None:
"""Submit the configured CustomJob.
|
|
|
|
Args:
|
|
service_account (str):
|
|
Optional. Specifies the service account for workload run-as account.
|
|
Users submitting jobs must have act-as permission on this run-as account.
|
|
network (str):
|
|
Optional. The full name of the Compute Engine network to which the job
|
|
should be peered. For example, projects/12345/global/networks/myVPC.
|
|
Private services access must already be configured for the network.
|
|
timeout (int):
|
|
The maximum job running time in seconds. The default is 7 days.
|
|
restart_job_on_worker_restart (bool):
|
|
Restarts the entire CustomJob if a worker
|
|
gets restarted. This feature can be used by
|
|
distributed training jobs that are not resilient
|
|
to workers leaving and joining a job.
|
|
enable_web_access (bool):
|
|
Whether you want Vertex AI to enable interactive shell access
|
|
to training containers.
|
|
https://cloud.google.com/vertex-ai/docs/training/monitor-debug-interactive-shell
|
|
experiment (Union[aiplatform.Experiment, str]):
|
|
Optional. The instance or name of an Experiment resource to which
|
|
this CustomJob will upload training parameters and metrics.
|
|
|
|
`service_account` is required with provided `experiment`.
|
|
For more information on configuring your service account please visit:
|
|
https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-training
|
|
experiment_run (Union[aiplatform.ExperimentRun, str]):
|
|
Optional. The instance or name of an ExperimentRun resource to which
|
|
this CustomJob will upload training parameters and metrics.
|
|
This arg can only be set when `experiment` is set. If 'experiment'
|
|
is set but 'experiment_run` is not, an ExperimentRun resource
|
|
will still be auto-generated.
|
|
tensorboard (str):
|
|
Optional. The name of a Vertex AI
|
|
[Tensorboard][google.cloud.aiplatform.v1beta1.Tensorboard]
|
|
resource to which this CustomJob will upload Tensorboard
|
|
logs. Format:
|
|
``projects/{project}/locations/{location}/tensorboards/{tensorboard}``
|
|
|
|
The training script should write Tensorboard to following Vertex AI environment
|
|
variable:
|
|
|
|
AIP_TENSORBOARD_LOG_DIR
|
|
|
|
`service_account` is required with provided `tensorboard`.
|
|
For more information on configuring your service account please visit:
|
|
https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-training
|
|
create_request_timeout (float):
|
|
Optional. The timeout for the create request in seconds.
|
|
disable_retries (bool):
|
|
Indicates if the job should retry for internal errors after the
|
|
job starts running. If True, overrides
|
|
`restart_job_on_worker_restart` to False.
|
|
persistent_resource_id (str):
|
|
Optional. The ID of the PersistentResource in the same Project
|
|
and Location. If this is specified, the job will be run on
|
|
existing machines held by the PersistentResource instead of
|
|
on-demand short-live machines. The network, CMEK, and node pool
|
|
configs on the job should be consistent with those on the
|
|
PersistentResource, otherwise, the job will be rejected.
|
|
scheduling_strategy (gca_custom_job_compat.Scheduling.Strategy):
|
|
Optional. Indicates the job scheduling strategy.
|
|
max_wait_duration (int):
|
|
This is the maximum duration that a job will wait for the
|
|
requested resources to be provisioned in seconds. If set to 0,
|
|
the job will wait indefinitely. The default is 1 day.
|
|
|
|
Raises:
|
|
ValueError:
|
|
If both `experiment` and `tensorboard` are specified or if
|
|
`enable_autolog` is True in `CustomJob.from_local_script` but
|
|
`experiment` is not specified or the specified experiment
|
|
doesn't have a backing tensorboard.
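
        Example usage (a minimal sketch; `my_job` is an already configured
        CustomJob, submitted without blocking and waited on explicitly):

        ```
        my_job.submit()
        # ... do other work while the job runs ...
        my_job.wait()
        ```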
        """
|
|
if experiment and tensorboard:
|
|
raise ValueError("'experiment' and 'tensorboard' cannot be set together.")
|
|
if self._enable_autolog and (not experiment):
|
|
raise ValueError(
|
|
"'experiment' is required since you've enabled autolog in 'from_local_script'."
|
|
)
|
|
|
|
service_account = service_account or initializer.global_config.service_account
|
|
if service_account:
|
|
self._gca_resource.job_spec.service_account = service_account
|
|
|
|
if network:
|
|
self._gca_resource.job_spec.network = network
|
|
|
|
if (
|
|
timeout
|
|
or restart_job_on_worker_restart
|
|
or disable_retries
|
|
or scheduling_strategy
|
|
or max_wait_duration
|
|
):
|
|
timeout = duration_pb2.Duration(seconds=timeout) if timeout else None
|
|
max_wait_duration = (
|
|
duration_pb2.Duration(seconds=max_wait_duration)
|
|
if max_wait_duration
|
|
else None
|
|
)
|
|
self._gca_resource.job_spec.scheduling = gca_custom_job_compat.Scheduling(
|
|
timeout=timeout,
|
|
restart_job_on_worker_restart=restart_job_on_worker_restart,
|
|
disable_retries=disable_retries,
|
|
strategy=scheduling_strategy,
|
|
max_wait_duration=max_wait_duration,
|
|
)
|
|
|
|
if enable_web_access:
|
|
self._gca_resource.job_spec.enable_web_access = enable_web_access
|
|
|
|
if tensorboard:
|
|
self._gca_resource.job_spec.tensorboard = tensorboard
|
|
|
|
if persistent_resource_id:
|
|
self._gca_resource.job_spec.persistent_resource_id = persistent_resource_id
|
|
|
|
(
|
|
self._gca_resource.job_spec.experiment,
|
|
self._gca_resource.job_spec.experiment_run,
|
|
) = self._get_experiment_and_run_resource_name(experiment, experiment_run)
|
|
|
|
_LOGGER.log_create_with_lro(self.__class__)
|
|
|
|
self._gca_resource = self.api_client.create_custom_job(
|
|
parent=self._parent,
|
|
custom_job=self._gca_resource,
|
|
timeout=create_request_timeout,
|
|
)
|
|
|
|
_LOGGER.log_create_complete_with_getter(
|
|
self.__class__, self._gca_resource, "custom_job"
|
|
)
|
|
|
|
_LOGGER.info("View Custom Job:\n%s" % self._dashboard_uri())
|
|
|
|
if tensorboard:
|
|
_LOGGER.info(
|
|
"View Tensorboard:\n%s"
|
|
% console_utils.custom_job_tensorboard_console_uri(
|
|
tensorboard, self.resource_name
|
|
)
|
|
)
|
|
|
|
@property
|
|
def job_spec(self):
|
|
return self._gca_resource.job_spec
|
|
|
|
@staticmethod
|
|
def _get_experiment_and_run_resource_name(
|
|
experiment: Optional[Union["aiplatform.Experiment", str]] = None,
|
|
experiment_run: Optional[Union["aiplatform.ExperimentRun", str]] = None,
|
|
) -> Tuple[str, str]:
|
|
"""Helper method to get the experiment and run resource name for the custom job."""
|
|
if not experiment:
|
|
return None, None
|
|
|
|
experiment_resource = (
|
|
aiplatform.Experiment(experiment)
|
|
if isinstance(experiment, str)
|
|
else experiment
|
|
)
|
|
|
|
if not experiment_run:
|
|
return experiment_resource.resource_name, None
|
|
|
|
experiment_run_resource = (
|
|
aiplatform.ExperimentRun(experiment_run, experiment_resource)
|
|
if isinstance(experiment_run, str)
|
|
else experiment_run
|
|
)
|
|
return (
|
|
experiment_resource.resource_name,
|
|
experiment_run_resource.resource_name,
|
|
)


class HyperparameterTuningJob(_RunnableJob, base.PreviewMixin):
    """Vertex AI Hyperparameter Tuning Job."""

    _resource_noun = "hyperparameterTuningJobs"
    _getter_method = "get_hyperparameter_tuning_job"
    _list_method = "list_hyperparameter_tuning_jobs"
    _cancel_method = "cancel_hyperparameter_tuning_job"
    _delete_method = "delete_hyperparameter_tuning_job"
    _parse_resource_name_method = "parse_hyperparameter_tuning_job_path"
    _format_resource_name_method = "hyperparameter_tuning_job_path"
    _job_type = "training"
    _preview_class = (
        "google.cloud.aiplatform.aiplatform.preview.jobs.HyperparameterTuningJob"
    )

    def __init__(
        self,
        # TODO(b/223262536): Make display_name parameter fully optional in next major release
        display_name: str,
        custom_job: CustomJob,
        metric_spec: Dict[str, str],
        parameter_spec: Dict[str, hyperparameter_tuning._ParameterSpec],
        max_trial_count: int,
        parallel_trial_count: int,
        max_failed_trial_count: int = 0,
        search_algorithm: Optional[str] = None,
        measurement_selection: Optional[str] = "best",
        project: Optional[str] = None,
        location: Optional[str] = None,
        credentials: Optional[auth_credentials.Credentials] = None,
        labels: Optional[Dict[str, str]] = None,
        encryption_spec_key_name: Optional[str] = None,
    ):
"""
|
|
Configures a HyperparameterTuning Job.
|
|
|
|
Example usage:
|
|
|
|
```
|
|
from google.cloud.aiplatform import hyperparameter_tuning as hpt
|
|
|
|
worker_pool_specs = [
|
|
{
|
|
"machine_spec": {
|
|
"machine_type": "n1-standard-4",
|
|
"accelerator_type": "NVIDIA_TESLA_K80",
|
|
"accelerator_count": 1,
|
|
},
|
|
"replica_count": 1,
|
|
"container_spec": {
|
|
"image_uri": container_image_uri,
|
|
"command": [],
|
|
"args": [],
|
|
},
|
|
}
|
|
]
|
|
|
|
custom_job = aiplatform.CustomJob(
|
|
display_name='my_job',
|
|
worker_pool_specs=worker_pool_specs,
|
|
labels={'my_key': 'my_value'},
|
|
)
|
|
|
|
|
|
hp_job = aiplatform.HyperparameterTuningJob(
|
|
display_name='hp-test',
|
|
custom_job=job,
|
|
metric_spec={
|
|
'loss': 'minimize',
|
|
},
|
|
parameter_spec={
|
|
'lr': hpt.DoubleParameterSpec(min=0.001, max=0.1, scale='log'),
|
|
'units': hpt.IntegerParameterSpec(min=4, max=128, scale='linear'),
|
|
'activation': hpt.CategoricalParameterSpec(values=['relu', 'selu']),
|
|
'batch_size': hpt.DiscreteParameterSpec(values=[128, 256], scale='linear')
|
|
},
|
|
max_trial_count=128,
|
|
parallel_trial_count=8,
|
|
labels={'my_key': 'my_value'},
|
|
)
|
|
|
|
hp_job.run()
|
|
|
|
print(hp_job.trials)
|
|
```
|
|
|
|
|
|
For more information on using hyperparameter tuning please visit:
|
|
https://cloud.google.com/ai-platform-unified/docs/training/using-hyperparameter-tuning
|
|
|
|
        Args:
            display_name (str):
                Required. The user-defined name of the HyperparameterTuningJob.
                The name can be up to 128 characters long and can consist
                of any UTF-8 characters.
            custom_job (aiplatform.CustomJob):
                Required. Configured CustomJob. The worker pool spec from this custom job
                applies to the CustomJobs created in all the trials. A persistent_resource_id can be
                specified on the custom job to be used when running this Hyperparameter Tuning job.
            metric_spec (Dict[str, str]):
                Required. Dictionary representing metrics to optimize. The dictionary key is the metric_id,
                which is reported by your training job, and the dictionary value is the
                optimization goal of the metric ('minimize' or 'maximize'). Example:

                metric_spec = {'loss': 'minimize', 'accuracy': 'maximize'}

            parameter_spec (Dict[str, hyperparameter_tuning._ParameterSpec]):
                Required. Dictionary representing parameters to optimize. The dictionary key is the parameter_id,
                which is passed into your training job as a command-line keyword argument, and the
                dictionary value is the parameter specification. Example:

                from google.cloud.aiplatform import hyperparameter_tuning as hpt

                parameter_spec = {
                    'decay': hpt.DoubleParameterSpec(min=1e-7, max=1, scale='linear'),
                    'learning_rate': hpt.DoubleParameterSpec(min=1e-7, max=1, scale='linear'),
                    'batch_size': hpt.DiscreteParameterSpec(values=[4, 8, 16, 32, 64, 128], scale='linear')
                }

                Supported parameter specifications can be found in aiplatform.hyperparameter_tuning.
                These parameter specifications are currently supported:
                DoubleParameterSpec, IntegerParameterSpec, CategoricalParameterSpec, DiscreteParameterSpec

            max_trial_count (int):
                Required. The desired total number of Trials.
            parallel_trial_count (int):
                Required. The desired number of Trials to run in parallel.
            max_failed_trial_count (int):
                Optional. The number of failed Trials that need to be
                seen before failing the HyperparameterTuningJob.
                If set to 0, Vertex AI decides how many Trials
                must fail before the whole job fails.
            search_algorithm (str):
                The search algorithm specified for the Study.
                Accepts one of the following:

                `None` - If you do not specify an algorithm, your job uses
                the default Vertex AI algorithm. The default algorithm
                applies Bayesian optimization to arrive at the optimal
                solution with a more effective search over the parameter space.

                'grid' - A simple grid search within the feasible space. This
                option is particularly useful if you want to specify a quantity
                of trials that is greater than the number of points in the
                feasible space. In such cases, if you do not specify a grid
                search, the Vertex AI default algorithm may generate duplicate
                suggestions. To use grid search, all parameter specs must be
                of type `IntegerParameterSpec`, `CategoricalParameterSpec`,
                or `DiscreteParameterSpec`.

                'random' - A simple random search within the feasible space.
            measurement_selection (str):
                This indicates which measurement to use if/when the service
                automatically selects the final measurement from previously reported
                intermediate measurements.

                Accepts: 'best', 'last'

                Choose this based on two considerations:
                A) Do you expect your measurements to monotonically improve? If so,
                choose 'last'. On the other hand, if you're in a situation
                where your system can "over-train" and you expect the performance to
                get better for a while but then start declining, choose 'best'.
                B) Are your measurements significantly noisy and/or irreproducible?
                If so, 'best' will tend to be over-optimistic, and it may be better
                to choose 'last'. If both or neither of (A) and (B) apply, it
                doesn't matter which selection type is chosen.
            project (str):
                Optional. Project to run the HyperparameterTuningJob in. Overrides project set in aiplatform.init.
            location (str):
                Optional. Location to run the HyperparameterTuningJob in. Overrides location set in aiplatform.init.
            credentials (auth_credentials.Credentials):
                Optional. Custom credentials to use to call the HyperparameterTuning service. Overrides
                credentials set in aiplatform.init.
            labels (Dict[str, str]):
                Optional. The labels with user-defined metadata to
                organize HyperparameterTuningJobs.
                Label keys and values can be no longer than 64
                characters (Unicode codepoints), can only
                contain lowercase letters, numeric characters,
                underscores and dashes. International characters
                are allowed.
                See https://goo.gl/xmQnxf for more information
                and examples of labels.
            encryption_spec_key_name (str):
                Optional. Customer-managed encryption key options for a
                HyperparameterTuningJob. If this is set, then
                all resources created by the
                HyperparameterTuningJob will be encrypted with
                the provided encryption key.
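
        Example of a grid-search configuration (a minimal sketch; the
        parameter names, bounds, and trial counts are illustrative). With
        `search_algorithm='grid'`, every parameter spec must be an
        `IntegerParameterSpec`, `CategoricalParameterSpec`, or
        `DiscreteParameterSpec`:

            hp_job = aiplatform.HyperparameterTuningJob(
                display_name='hp-grid-test',
                custom_job=custom_job,
                metric_spec={'loss': 'minimize'},
                parameter_spec={
                    'units': hpt.IntegerParameterSpec(min=4, max=8, scale='linear'),
                    'batch_size': hpt.DiscreteParameterSpec(values=[16, 32], scale='linear'),
                },
                # 5 integer values x 2 batch sizes = 10 grid points, so
                # max_trial_count=10 covers the whole feasible space.
                search_algorithm='grid',
                measurement_selection='last',
                max_trial_count=10,
                parallel_trial_count=2,
            )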
"""
|
|
        super().__init__(project=project, location=location, credentials=credentials)

        metrics = [
            gca_study_compat.StudySpec.MetricSpec(
                metric_id=metric_id, goal=goal.upper()
            )
            for metric_id, goal in metric_spec.items()
        ]

        parameters = [
            parameter._to_parameter_spec(parameter_id=parameter_id)
            for parameter_id, parameter in parameter_spec.items()
        ]

        study_spec = gca_study_compat.StudySpec(
            metrics=metrics,
            parameters=parameters,
            algorithm=hyperparameter_tuning.SEARCH_ALGORITHM_TO_PROTO_VALUE[
                search_algorithm
            ],
            measurement_selection_type=hyperparameter_tuning.MEASUREMENT_SELECTION_TO_PROTO_VALUE[
                measurement_selection
            ],
        )

        if not display_name:
            display_name = self.__class__._generate_display_name()

        self._gca_resource = (
            gca_hyperparameter_tuning_job_compat.HyperparameterTuningJob(
                display_name=display_name,
                study_spec=study_spec,
                max_trial_count=max_trial_count,
                parallel_trial_count=parallel_trial_count,
                max_failed_trial_count=max_failed_trial_count,
                trial_job_spec=copy.deepcopy(custom_job.job_spec),
                labels=labels,
                encryption_spec=initializer.global_config.get_encryption_spec(
                    encryption_spec_key_name=encryption_spec_key_name
                ),
            )
        )

    @property
    def network(self) -> Optional[str]:
        """The full name of the Google Compute Engine
        [network](https://cloud.google.com/vpc/docs/vpc#networks) to which this
        HyperparameterTuningJob should be peered.

        Takes the format `projects/{project}/global/networks/{network}`, where
        {project} is a project number, as in `12345`, and {network} is a network name.

        Private services access must already be configured for the network. If left
        unspecified, the HyperparameterTuningJob is not peered with any network.
        """
        self._assert_gca_resource_is_available()
        return getattr(self._gca_resource.trial_job_spec, "network")

    def _get_web_access_uris(self) -> Dict[str, Dict[str, str]]:
        """Helper method to get the web access uris of the hyperparameter job

        Returns:
            (Dict[str, Dict[str, str]]):
                Web access uris of the hyperparameter job.
        """
        web_access_uris = dict()
        for trial in self.trials:
            web_access_uris[trial.id] = web_access_uris.get(trial.id, dict())
            for worker, uri in trial.web_access_uris.items():
                web_access_uris[trial.id][worker] = uri
        return web_access_uris

    def _log_web_access_uris(self):
        """Helper method to log the web access uris of the hyperparameter job"""

        for trial_id, trial_web_access_uris in self._get_web_access_uris().items():
            for worker, uri in trial_web_access_uris.items():
                if uri not in self._logged_web_access_uris:
                    _LOGGER.info(
                        "%s %s access the interactive shell terminals for trial - %s:\n%s:\n%s"
                        % (
                            self.__class__.__name__,
                            self._gca_resource.name,
                            trial_id,
                            worker,
                            uri,
                        ),
                    )
                    self._logged_web_access_uris.add(uri)

    def run(
        self,
        service_account: Optional[str] = None,
        network: Optional[str] = None,
        timeout: Optional[int] = None,  # seconds
        restart_job_on_worker_restart: bool = False,
        enable_web_access: bool = False,
        tensorboard: Optional[str] = None,
        sync: bool = True,
        create_request_timeout: Optional[float] = None,
        disable_retries: bool = False,
        scheduling_strategy: Optional[gca_custom_job_compat.Scheduling.Strategy] = None,
        max_wait_duration: Optional[int] = None,  # seconds
    ) -> None:
        """Run this configured HyperparameterTuningJob.

        Args:
            service_account (str):
                Optional. Specifies the service account for workload run-as account.
                Users submitting jobs must have act-as permission on this run-as account.
            network (str):
                Optional. The full name of the Compute Engine network to which the job
                should be peered. For example, projects/12345/global/networks/myVPC.
                Private services access must already be configured for the network.
                If left unspecified, the network set in aiplatform.init will be used.
                Otherwise, the job is not peered with any network.
            timeout (int):
                Optional. The maximum job running time in seconds. The default is 7 days.
            restart_job_on_worker_restart (bool):
                Restarts the entire CustomJob if a worker
                gets restarted. This feature can be used by
                distributed training jobs that are not resilient
                to workers leaving and joining a job.
            enable_web_access (bool):
                Whether you want Vertex AI to enable interactive shell access
                to training containers.
                https://cloud.google.com/vertex-ai/docs/training/monitor-debug-interactive-shell
            tensorboard (str):
                Optional. The name of a Vertex AI
                [Tensorboard][google.cloud.aiplatform.v1beta1.Tensorboard]
                resource to which this CustomJob will upload Tensorboard
                logs. Format:
                ``projects/{project}/locations/{location}/tensorboards/{tensorboard}``

                The training script should write Tensorboard logs to the following
                Vertex AI environment variable:

                AIP_TENSORBOARD_LOG_DIR

                `service_account` is required with provided `tensorboard`.
                For more information on configuring your service account please visit:
                https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-training
            sync (bool):
                Whether to execute this method synchronously. If False, this method
                will unblock and it will be executed in a concurrent Future.
            create_request_timeout (float):
                Optional. The timeout for the create request in seconds.
            disable_retries (bool):
                Indicates if the job should retry for internal errors after the
                job starts running. If True, overrides
                `restart_job_on_worker_restart` to False.
            scheduling_strategy (gca_custom_job_compat.Scheduling.Strategy):
                Optional. Indicates the job scheduling strategy.
            max_wait_duration (int):
                This is the maximum duration that a job will wait for the
                requested resources to be provisioned in seconds. If set to 0,
                the job will wait indefinitely. The default is 1 day.
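
        Example (a minimal sketch; the service account and Tensorboard
        resource name are illustrative):

            hp_job.run(
                service_account='sa@my-project.iam.gserviceaccount.com',
                tensorboard='projects/my-project/locations/us-central1/tensorboards/123',
                sync=True,
            )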
"""
|
|
        network = network or initializer.global_config.network
        service_account = service_account or initializer.global_config.service_account

        self._run(
            service_account=service_account,
            network=network,
            timeout=timeout,
            restart_job_on_worker_restart=restart_job_on_worker_restart,
            enable_web_access=enable_web_access,
            tensorboard=tensorboard,
            sync=sync,
            create_request_timeout=create_request_timeout,
            disable_retries=disable_retries,
            scheduling_strategy=scheduling_strategy,
            max_wait_duration=max_wait_duration,
        )

    @base.optional_sync()
    def _run(
        self,
        service_account: Optional[str] = None,
        network: Optional[str] = None,
        timeout: Optional[int] = None,  # seconds
        restart_job_on_worker_restart: bool = False,
        enable_web_access: bool = False,
        tensorboard: Optional[str] = None,
        sync: bool = True,
        create_request_timeout: Optional[float] = None,
        disable_retries: bool = False,
        scheduling_strategy: Optional[gca_custom_job_compat.Scheduling.Strategy] = None,
        max_wait_duration: Optional[int] = None,  # seconds
    ) -> None:
        """Helper method to ensure network synchronization and to run the configured HyperparameterTuningJob.

        Args:
            service_account (str):
                Optional. Specifies the service account for workload run-as account.
                Users submitting jobs must have act-as permission on this run-as account.
            network (str):
                Optional. The full name of the Compute Engine network to which the job
                should be peered. For example, projects/12345/global/networks/myVPC.
                Private services access must already be configured for the network.
            timeout (int):
                Optional. The maximum job running time in seconds. The default is 7 days.
            restart_job_on_worker_restart (bool):
                Restarts the entire CustomJob if a worker
                gets restarted. This feature can be used by
                distributed training jobs that are not resilient
                to workers leaving and joining a job.
            enable_web_access (bool):
                Whether you want Vertex AI to enable interactive shell access
                to training containers.
                https://cloud.google.com/vertex-ai/docs/training/monitor-debug-interactive-shell
            tensorboard (str):
                Optional. The name of a Vertex AI
                [Tensorboard][google.cloud.aiplatform.v1beta1.Tensorboard]
                resource to which this CustomJob will upload Tensorboard
                logs. Format:
                ``projects/{project}/locations/{location}/tensorboards/{tensorboard}``

                The training script should write Tensorboard logs to the following
                Vertex AI environment variable:

                AIP_TENSORBOARD_LOG_DIR

                `service_account` is required with provided `tensorboard`.
                For more information on configuring your service account please visit:
                https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-training
            sync (bool):
                Whether to execute this method synchronously. If False, this method
                will unblock and it will be executed in a concurrent Future.
            create_request_timeout (float):
                Optional. The timeout for the create request in seconds.
            disable_retries (bool):
                Indicates if the job should retry for internal errors after the
                job starts running. If True, overrides
                `restart_job_on_worker_restart` to False.
            scheduling_strategy (gca_custom_job_compat.Scheduling.Strategy):
                Optional. Indicates the job scheduling strategy.
            max_wait_duration (int):
                This is the maximum duration that a job will wait for the
                requested resources to be provisioned in seconds. If set to 0,
                the job will wait indefinitely. The default is 1 day.
        """
        if service_account:
            self._gca_resource.trial_job_spec.service_account = service_account

        if network:
            self._gca_resource.trial_job_spec.network = network

        if (
            timeout
            or restart_job_on_worker_restart
            or disable_retries
            or max_wait_duration
            or scheduling_strategy
        ):
            # Wrap the raw second counts in protobuf Durations before
            # assembling the trial job's Scheduling message.
            timeout = duration_pb2.Duration(seconds=timeout) if timeout else None
            max_wait_duration = (
                duration_pb2.Duration(seconds=max_wait_duration)
                if max_wait_duration
                else None
            )
            self._gca_resource.trial_job_spec.scheduling = (
                gca_custom_job_compat.Scheduling(
                    timeout=timeout,
                    restart_job_on_worker_restart=restart_job_on_worker_restart,
                    disable_retries=disable_retries,
                    strategy=scheduling_strategy,
                    max_wait_duration=max_wait_duration,
                )
            )

        if enable_web_access:
            self._gca_resource.trial_job_spec.enable_web_access = enable_web_access

        if tensorboard:
            self._gca_resource.trial_job_spec.tensorboard = tensorboard

        _LOGGER.log_create_with_lro(self.__class__)

        self._gca_resource = self.api_client.create_hyperparameter_tuning_job(
            parent=self._parent,
            hyperparameter_tuning_job=self._gca_resource,
            timeout=create_request_timeout,
        )

        _LOGGER.log_create_complete_with_getter(
            self.__class__, self._gca_resource, "hpt_job"
        )

        _LOGGER.info("View HyperparameterTuningJob:\n%s" % self._dashboard_uri())

        if tensorboard:
            _LOGGER.info(
                "View Tensorboard:\n%s"
                % console_utils.custom_job_tensorboard_console_uri(
                    tensorboard, self.resource_name
                )
            )

        self._block_until_complete()

    @property
    def trials(self) -> List[gca_study_compat.Trial]:
        """The trials associated with this HyperparameterTuningJob."""
        self._assert_gca_resource_is_available()
        return list(self._gca_resource.trials)


class ModelDeploymentMonitoringJob(_Job):
    """Vertex AI Model Deployment Monitoring Job.

    This class should be used in conjunction with the Endpoint class
    in order to configure model monitoring for deployed models.
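
    Example usage (a minimal sketch; the job ID, project, and location are
    illustrative):

    ```
    mdm_job = aiplatform.ModelDeploymentMonitoringJob(
        model_deployment_monitoring_job_name='456',
        project='my-project',
        location='us-central1',
    )

    mdm_job.pause()   # temporarily stop scheduled monitoring runs
    mdm_job.resume()  # pick the schedule back up
    ```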
"""
|
|
|
|
_resource_noun = "modelDeploymentMonitoringJobs"
|
|
_getter_method = "get_model_deployment_monitoring_job"
|
|
_list_method = "list_model_deployment_monitoring_jobs"
|
|
_cancel_method = "cancel_model_deployment_monitoring_jobs"
|
|
_delete_method = "delete_model_deployment_monitoring_job"
|
|
_job_type = "model-deployment-monitoring"
|
|
_parse_resource_name_method = "parse_model_deployment_monitoring_job_path"
|
|
_format_resource_name_method = "model_deployment_monitoring_job_path"
|
|
|
|
    def __init__(
        self,
        model_deployment_monitoring_job_name: str,
        project: Optional[str] = None,
        location: Optional[str] = None,
        credentials: Optional[auth_credentials.Credentials] = None,
    ):
        """Initializer for ModelDeploymentMonitoringJob.

        Args:
            model_deployment_monitoring_job_name (str):
                Required. A fully-qualified ModelDeploymentMonitoringJob resource name or ID.
                Example: "projects/.../locations/.../modelDeploymentMonitoringJobs/456" or
                "456" when project and location are initialized or passed.
            project (str):
                Optional. Project to retrieve ModelDeploymentMonitoringJob from. If not set,
                project set in aiplatform.init will be used.
            location (str):
                Optional. Location to retrieve ModelDeploymentMonitoringJob from. If not set,
                location set in aiplatform.init will be used.
            credentials (auth_credentials.Credentials):
                Optional. Custom credentials to use. If not set, credentials set in
                aiplatform.init will be used.
        """
        super().__init__(
            job_name=model_deployment_monitoring_job_name,
            project=project,
            location=location,
            credentials=credentials,
        )
        self._gca_resource = self._get_gca_resource(
            resource_name=model_deployment_monitoring_job_name
        )

    @classmethod
    def _parse_configs(
        cls,
        objective_configs: Union[
            model_monitoring.ObjectiveConfig,
            Dict[str, model_monitoring.ObjectiveConfig],
        ],
        endpoint: "aiplatform.Endpoint",
        deployed_model_ids: Optional[List[str]] = None,
    ) -> List[
        gca_model_deployment_monitoring_job_compat.ModelDeploymentMonitoringObjectiveConfig
    ]:
        """Helper function for matching objective configs with their corresponding models.

        Args:
            objective_configs (Union[model_monitoring.objective.ObjectiveConfig,
                Dict[str, model_monitoring.objective.ObjectiveConfig]]):
                Required. A single config if it applies to all models, or a dictionary of
                model_id: model_monitoring.objective.ObjectiveConfig if
                different model IDs have different configs.
            endpoint (aiplatform.Endpoint):
                Required. A valid instance of aiplatform.Endpoint to launch the MDM job on.
            deployed_model_ids (Optional[List[str]]):
                Optional. A list of deployed model IDs to apply the objective config to.
                Note that a model will have a deployed_model_id that is different from the
                uploaded model ID, and IDs in this list should consist of deployed model IDs
                on the same endpoint passed in the argument. If `objective_configs` is a dictionary,
                then this parameter is ignored. If `objective_configs` is an instance of
                `model_monitoring.ObjectiveConfig` and `deployed_model_ids` is a non-empty
                list of valid IDs, then the same objective config will apply to all models in this list.

        Returns:
            A List of ModelDeploymentMonitoringObjectiveConfig objects.

        Raises:
            ValueError:
                When the model IDs given are invalid.
            RuntimeError:
                When XAI is enabled on a model that doesn't have XAI parameters
                configured.
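
        Example of the two accepted shapes (a minimal sketch; the deployed
        model IDs are illustrative):

            # One config applied to all (or the listed) deployed models:
            objective_configs = model_monitoring.ObjectiveConfig(...)

            # Per-model configs, keyed by *deployed* model ID:
            objective_configs = {
                '1234': model_monitoring.ObjectiveConfig(...),
                '5678': model_monitoring.ObjectiveConfig(...),
            }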
"""
|
|
        all_models = []
        xai_enabled = []
        for model in endpoint.list_models():
            all_models.append(model.id)
            if str(model.explanation_spec.parameters) != "":
                xai_enabled.append(model.id)

        all_configs = []

        ## when the same objective config is applied to SOME or ALL models
        if deployed_model_ids is not None:
            if not all(model in all_models for model in deployed_model_ids):
                error_string = (
                    "Invalid model ID. The model ID must be one of ["
                    + ",".join(all_models)
                    + "]. Note that deployed model IDs are different from the uploaded model's ID"
                )
                raise ValueError(error_string)
            else:
                all_models = deployed_model_ids

        if isinstance(objective_configs, model_monitoring.ObjectiveConfig):
            for model in all_models:
                if (
                    model not in xai_enabled
                    and objective_configs.explanation_config is not None
                ):
                    raise RuntimeError(
                        "Invalid config for model ID %s. `explanation_config` should only be enabled if the model has `explanation_spec` populated"
                        % model
                    )
                all_configs.append(
                    gca_model_deployment_monitoring_job_compat.ModelDeploymentMonitoringObjectiveConfig(
                        deployed_model_id=model,
                        objective_config=objective_configs.as_proto(),
                    )
                )

        ## when different objective configs are applied to EACH model
        else:
            if not all(model in all_models for model in objective_configs.keys()):
                error_string = (
                    "Invalid model ID. The model ID must be one of ["
                    + ",".join(all_models)
                    + "]. Note that deployed model IDs are different from the uploaded model's ID"
                )
                raise ValueError(error_string)
            for deployed_model, objective_config in objective_configs.items():
                if (
                    deployed_model not in xai_enabled
                    and objective_config.explanation_config is not None
                ):
                    raise RuntimeError(
                        "Invalid config for model ID %s. `explanation_config` should only be enabled if the model has `explanation_spec` populated"
                        % deployed_model
                    )
                all_configs.append(
                    gca_model_deployment_monitoring_job_compat.ModelDeploymentMonitoringObjectiveConfig(
                        deployed_model_id=deployed_model,
                        objective_config=objective_config.as_proto(),
                    )
                )

        return all_configs

    @classmethod
    def create(
        cls,
        endpoint: Union[str, "aiplatform.Endpoint"],
        objective_configs: Optional[
            Union[
                model_monitoring.ObjectiveConfig,
                Dict[str, model_monitoring.ObjectiveConfig],
            ]
        ] = None,
        logging_sampling_strategy: Optional[model_monitoring.RandomSampleConfig] = None,
        schedule_config: Optional[model_monitoring.ScheduleConfig] = None,
        display_name: Optional[str] = None,
        deployed_model_ids: Optional[List[str]] = None,
        alert_config: Optional[model_monitoring.EmailAlertConfig] = None,
        predict_instance_schema_uri: Optional[str] = None,
        sample_predict_instance: Optional[str] = None,
        analysis_instance_schema_uri: Optional[str] = None,
        bigquery_tables_log_ttl: Optional[int] = None,
        stats_anomalies_base_directory: Optional[str] = None,
        enable_monitoring_pipeline_logs: Optional[bool] = None,
        labels: Optional[Dict[str, str]] = None,
        encryption_spec_key_name: Optional[str] = None,
        project: Optional[str] = None,
        location: Optional[str] = None,
        credentials: Optional[auth_credentials.Credentials] = None,
        create_request_timeout: Optional[float] = None,
    ) -> "ModelDeploymentMonitoringJob":
"""Creates and launches a model monitoring job.
|
|
|
|
Args:
|
|
endpoint (Union[str, "aiplatform.Endpoint"]):
|
|
Required. Endpoint resource name or an instance of `aiplatform.Endpoint`. Format:
|
|
``projects/{project}/locations/{location}/endpoints/{endpoint}``
|
|
|
|
objective_configs (Union[model_monitoring.ObjectiveConfig,
|
|
Dict[str, model_monitoring.ObjectiveConfig]]):
|
|
Required. A single config if it applies to all models, or a dictionary of
|
|
model_id: model_monitoring.objective.ObjectiveConfig if
|
|
different model IDs have different configs.
|
|
|
|
logging_sampling_strategy (model_monitoring.sampling.RandomSampleConfig):
|
|
Optional. Sample Strategy for logging.
|
|
|
|
schedule_config (model_monitoring.schedule.ScheduleConfig):
|
|
Optional. Configures model monitoring job scheduling interval in hours.
|
|
This defines how often the monitoring jobs are triggered.
|
|
|
|
display_name (str):
|
|
Optional. The user-defined name of the
|
|
ModelDeploymentMonitoringJob. The name can be up
|
|
to 128 characters long and can be consist of any
|
|
UTF-8 characters.
|
|
Display name of a ModelDeploymentMonitoringJob.
|
|
|
|
deployed_model_ids (List[str]):
|
|
Optional. Use this argument to specify which deployed models to
|
|
apply the objective config to. If left unspecified, the same config
|
|
will be applied to all deployed models.
|
|
|
|
alert_config (model_monitoring.alert.EmailAlertConfig):
|
|
Optional. Configures how alerts are sent to the user. Right now
|
|
only email alert is supported.
|
|
|
|
predict_instance_schema_uri (str):
|
|
Optional. YAML schema file uri describing the format of
|
|
a single instance, which are given to format
|
|
the Endpoint's prediction (and explanation). If
|
|
not set, the schema will be generated from
|
|
collected predict requests.
|
|
|
|
sample_predict_instance (str):
|
|
Optional. Sample Predict instance, same format as PredictionRequest.instances,
|
|
this can be set as a replacement of predict_instance_schema_uri
|
|
If not set, the schema will be generated from collected predict requests.
|
|
|
|
analysis_instance_schema_uri (str):
|
|
Optional. YAML schema file uri describing the format of a single
|
|
instance that you want Tensorflow Data Validation (TFDV) to
|
|
analyze. If this field is empty, all the feature data types are
|
|
inferred from predict_instance_schema_uri, meaning that TFDV
|
|
will use the data in the exact format as prediction request/response.
|
|
If there are any data type differences between predict instance
|
|
and TFDV instance, this field can be used to override the schema.
|
|
For models trained with Vertex AI, this field must be set as all the
|
|
fields in predict instance formatted as string.
|
|
|
|
bigquery_tables_log_ttl (int):
|
|
Optional. The TTL(time to live) of BigQuery tables in user projects
|
|
which stores logs. A day is the basic unit of
|
|
the TTL and we take the ceil of TTL/86400(a
|
|
day). e.g. { second: 3600} indicates ttl = 1
|
|
day.
|
|
|
|
stats_anomalies_base_directory (str):
|
|
Optional. Stats anomalies base folder path.
|
|
|
|
enable_monitoring_pipeline_logs (bool):
|
|
Optional. If true, the scheduled monitoring pipeline logs are sent to
|
|
Google Cloud Logging, including pipeline status and
|
|
anomalies detected. Please note the logs incur cost, which
|
|
are subject to `Cloud Logging
|
|
pricing <https://cloud.google.com/logging#pricing>`__.
|
|
|
|
labels (Dict[str, str]):
|
|
Optional. The labels with user-defined metadata to
|
|
organize the ModelDeploymentMonitoringJob.
|
|
Label keys and values can be no longer than 64
|
|
characters (Unicode codepoints), can only
|
|
contain lowercase letters, numeric characters,
|
|
underscores and dashes. International characters
|
|
are allowed. See https://goo.gl/xmQnxf for more information
|
|
and examples of labels.
|
|
|
|
encryption_spec_key_name (str):
|
|
Optional. Customer-managed encryption key spec for a
|
|
ModelDeploymentMonitoringJob. If set, this
|
|
ModelDeploymentMonitoringJob and all
|
|
sub-resources of this
|
|
ModelDeploymentMonitoringJob will be secured by
|
|
this key.
|
|
|
|
create_request_timeout (int):
|
|
Optional. Timeout in seconds for the model monitoring job creation request.
|
|
|
|
Returns:
|
|
An instance of ModelDeploymentMonitoringJob.
|
|
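
        Example (a minimal sketch; the endpoint ID, email address, feature
        name, and threshold are illustrative):

            from google.cloud.aiplatform import model_monitoring

            mdm_job = aiplatform.ModelDeploymentMonitoringJob.create(
                endpoint='projects/my-project/locations/us-central1/endpoints/123',
                display_name='mdm-job',
                logging_sampling_strategy=model_monitoring.RandomSampleConfig(sample_rate=0.8),
                schedule_config=model_monitoring.ScheduleConfig(monitor_interval=1),
                alert_config=model_monitoring.EmailAlertConfig(user_emails=['me@example.com']),
                objective_configs=model_monitoring.ObjectiveConfig(
                    drift_detection_config=model_monitoring.DriftDetectionConfig(
                        drift_thresholds={'feature_a': 0.05},
                    ),
                ),
            )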
"""
|
|
        if not display_name:
            display_name = cls._generate_display_name()

        utils.validate_display_name(display_name)

        if labels:
            utils.validate_labels(labels)

        if stats_anomalies_base_directory:
            stats_anomalies_base_directory = gca_io_compat.GcsDestination(
                output_uri_prefix=stats_anomalies_base_directory
            )

        if encryption_spec_key_name:
            encryption_spec_key_name = gca_encryption_spec_compat.EncryptionSpec(
                kms_key_name=encryption_spec_key_name
            )

        if credentials is None and isinstance(endpoint, aiplatform.Endpoint):
            credentials = endpoint.credentials
        self = cls._empty_constructor(
            project=project, location=location, credentials=credentials
        )

        parent = initializer.global_config.common_location_path(
            project=self.project,
            location=self.location,
        )

        if isinstance(endpoint, str):
            endpoint = aiplatform.Endpoint(endpoint, project, location, credentials)

        mdm_objective_config_seq = cls._parse_configs(
            objective_configs,
            endpoint,
            deployed_model_ids,
        )

        gapic_mdm_job = (
            gca_model_deployment_monitoring_job_compat.ModelDeploymentMonitoringJob(
                display_name=display_name,
                endpoint=endpoint.resource_name,
                model_deployment_monitoring_objective_configs=mdm_objective_config_seq,
                logging_sampling_strategy=logging_sampling_strategy.as_proto(),
                model_deployment_monitoring_schedule_config=schedule_config.as_proto(),
                model_monitoring_alert_config=alert_config.as_proto(),
                predict_instance_schema_uri=predict_instance_schema_uri,
                analysis_instance_schema_uri=analysis_instance_schema_uri,
                sample_predict_instance=sample_predict_instance,
                stats_anomalies_base_directory=stats_anomalies_base_directory,
                enable_monitoring_pipeline_logs=enable_monitoring_pipeline_logs,
                labels=labels,
                encryption_spec=encryption_spec_key_name,
            )
        )

        _LOGGER.log_create_with_lro(cls)
        self._gca_resource = self.api_client.create_model_deployment_monitoring_job(
            parent=parent,
            model_deployment_monitoring_job=gapic_mdm_job,
            timeout=create_request_timeout,
        )

        _LOGGER.log_create_complete(cls, self._gca_resource, "mdm_job")

        _LOGGER.info(
            "View Model Deployment Monitoring Job:\n%s" % self._dashboard_uri()
        )

        return self

    @classmethod
    def cancel(cls):
        raise NotImplementedError(
            "Cancel method is not implemented because it is not applicable. A running model deployment monitoring job can be paused or deleted."
        )

    @property
    def end_time(self):
        _LOGGER.info(
            "Model deployment monitoring jobs do not have an end time since their inactive states are either PAUSED or PENDING."
        )
        return None

    def update(
        self,
        *,
        display_name: Optional[str] = None,
        schedule_config: Optional[model_monitoring.ScheduleConfig] = None,
        alert_config: Optional[model_monitoring.EmailAlertConfig] = None,
        logging_sampling_strategy: Optional[model_monitoring.RandomSampleConfig] = None,
        labels: Optional[Dict[str, str]] = None,
        bigquery_tables_log_ttl: Optional[int] = None,
        enable_monitoring_pipeline_logs: Optional[bool] = None,
        objective_configs: Optional[
            Union[
                model_monitoring.ObjectiveConfig,
                Dict[str, model_monitoring.ObjectiveConfig],
            ]
        ] = None,
        deployed_model_ids: Optional[List[str]] = None,
        update_request_timeout: Optional[float] = None,
    ) -> "ModelDeploymentMonitoringJob":
        """Updates an existing ModelDeploymentMonitoringJob.

        Args:
            display_name (str):
                Optional. The user-defined name of the
                ModelDeploymentMonitoringJob. The name can be up
                to 128 characters long and can consist of any
                UTF-8 characters.

            schedule_config (model_monitoring.schedule.ScheduleConfig):
                Optional. Configures model monitoring job scheduling interval in hours.
                This defines how often the monitoring jobs are triggered.
            alert_config (model_monitoring.alert.EmailAlertConfig):
                Optional. Configures how alerts are sent to the user. Right now
                only email alert is supported.
            logging_sampling_strategy (model_monitoring.sampling.RandomSampleConfig):
                Optional. Sample Strategy for logging.

            labels (Dict[str, str]):
                Optional. The labels with user-defined metadata to
                organize the ModelDeploymentMonitoringJob.
                Label keys and values can be no longer than 64
                characters (Unicode codepoints), can only
                contain lowercase letters, numeric characters,
                underscores and dashes. International characters
                are allowed. See https://goo.gl/xmQnxf for more information
                and examples of labels.
            bigquery_tables_log_ttl (int):
                Optional. The number of days for which the logs are stored.
                The TTL (time to live) of BigQuery tables in user projects
                which store logs. A day is the basic unit of
                the TTL, and we take the ceiling of TTL/86400 (one
                day). For example, { seconds: 3600 } indicates a TTL
                of 1 day.

            enable_monitoring_pipeline_logs (bool):
                Optional. If true, the scheduled monitoring pipeline logs are sent to
                Google Cloud Logging, including pipeline status and
                anomalies detected. Please note the logs incur cost, which
                are subject to `Cloud Logging
                pricing <https://cloud.google.com/logging#pricing>`__.

            objective_configs (Union[model_monitoring.ObjectiveConfig,
                Dict[str, model_monitoring.ObjectiveConfig]]):
                Optional. A single config if it applies to all models, or a dictionary of
                model_id: model_monitoring.objective.ObjectiveConfig if
                different model IDs have different configs.

            deployed_model_ids (List[str]):
                Optional. Use this argument to specify which deployed models to
                apply the updated objective config to. If left unspecified, the same config
                will be applied to all deployed models.
            update_request_timeout (float):
                Optional. Timeout in seconds for the model monitoring job update request.

        Returns:
            ModelDeploymentMonitoringJob: The updated MDM job.
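
        Example (a minimal sketch; the interval and email address are
        illustrative):

            mdm_job = mdm_job.update(
                schedule_config=model_monitoring.ScheduleConfig(monitor_interval=12),
                alert_config=model_monitoring.EmailAlertConfig(user_emails=['me@example.com']),
            )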
"""
|
|
        self._sync_gca_resource()
        current_job = copy.deepcopy(self._gca_resource)
        update_mask: List[str] = []
        if display_name is not None:
            update_mask.append("display_name")
            current_job.display_name = display_name
        if schedule_config is not None:
            update_mask.append("model_deployment_monitoring_schedule_config")
            current_job.model_deployment_monitoring_schedule_config = (
                schedule_config.as_proto()
            )
        if alert_config is not None:
            update_mask.append("model_monitoring_alert_config")
            current_job.model_monitoring_alert_config = alert_config.as_proto()
        if logging_sampling_strategy is not None:
            update_mask.append("logging_sampling_strategy")
            current_job.logging_sampling_strategy = logging_sampling_strategy.as_proto()
        if labels is not None:
            update_mask.append("labels")
            current_job.labels = labels
        if bigquery_tables_log_ttl is not None:
            update_mask.append("log_ttl")
            current_job.log_ttl = duration_pb2.Duration(
                seconds=bigquery_tables_log_ttl * 86400
            )
        if enable_monitoring_pipeline_logs is not None:
            update_mask.append("enable_monitoring_pipeline_logs")
            current_job.enable_monitoring_pipeline_logs = (
                enable_monitoring_pipeline_logs
            )
        if objective_configs is not None:
            update_mask.append("model_deployment_monitoring_objective_configs")
            current_job.model_deployment_monitoring_objective_configs = (
                ModelDeploymentMonitoringJob._parse_configs(
                    objective_configs=objective_configs,
                    endpoint=aiplatform.Endpoint(
                        current_job.endpoint, credentials=self.credentials
                    ),
                    deployed_model_ids=deployed_model_ids,
                )
            )
        # TODO(b/254285776): add optional_sync support to model monitoring job
        lro = self.api_client.update_model_deployment_monitoring_job(
            model_deployment_monitoring_job=current_job,
            update_mask=field_mask_pb2.FieldMask(paths=update_mask),
            timeout=update_request_timeout,
        )
        self._gca_resource = lro.result(timeout=None)
        return self

    def pause(self) -> "ModelDeploymentMonitoringJob":
        """Pause a running MDM job."""
        self.api_client.pause_model_deployment_monitoring_job(
            name=self._gca_resource.name
        )
        return self

    def resume(self) -> "ModelDeploymentMonitoringJob":
        """Resumes a paused MDM job."""
        self.api_client.resume_model_deployment_monitoring_job(
            name=self._gca_resource.name
        )
        return self

    def delete(self) -> None:
        """Deletes an MDM job."""
        self.api_client.delete_model_deployment_monitoring_job(
            name=self._gca_resource.name
        )