structure saas with tools

This commit is contained in:
Davidson Gomes
2025-04-25 15:30:54 -03:00
commit 1aef473937
16434 changed files with 6584257 additions and 0 deletions

View File

@@ -0,0 +1,47 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Classes for working with language models."""
from vertexai.language_models._language_models import (
ChatMessage,
ChatModel,
ChatSession,
CodeChatModel,
CodeChatSession,
CodeGenerationModel,
InputOutputTextPair,
TextEmbedding,
TextEmbeddingInput,
TextEmbeddingModel,
TextGenerationModel,
TextGenerationResponse,
GroundingSource,
)
__all__ = [
"ChatMessage",
"ChatModel",
"ChatSession",
"CodeChatModel",
"CodeChatSession",
"CodeGenerationModel",
"InputOutputTextPair",
"TextEmbedding",
"TextEmbeddingInput",
"TextEmbeddingModel",
"TextGenerationModel",
"TextGenerationResponse",
"GroundingSource",
]
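
For orientation, a minimal usage sketch of the classes re-exported above (illustrative only, not part of this commit; the project ID, location, model versions, and prompts are placeholder assumptions):

import vertexai
from vertexai.language_models import (
    ChatModel,
    InputOutputTextPair,
    TextGenerationModel,
)

# Assumed placeholders: replace with a real project and location.
vertexai.init(project="my-project", location="us-central1")

# Text generation: predict() returns a TextGenerationResponse.
generation_model = TextGenerationModel.from_pretrained("text-bison@001")
print(generation_model.predict("Summarize model distillation in one sentence.").text)

# Chat: start_chat() returns a ChatSession seeded with example pairs.
chat_model = ChatModel.from_pretrained("chat-bison@001")
chat = chat_model.start_chat(
    examples=[InputOutputTextPair(input_text="Hi", output_text="Hello! How can I help?")]
)
print(chat.send_message("Which classes does this module expose?").text)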

View File

@@ -0,0 +1,141 @@
from typing import Optional, Union
from google.cloud import aiplatform
from google.cloud.aiplatform import initializer as aiplatform_initializer
from vertexai.language_models import _language_models
from vertexai.language_models import _language_models as tuning
_DISTILLATION_PIPELINE_URI = (
"https://us-kfp.pkg.dev/ml-pipeline/distillation/distillation/v1.0.0"
)
class DistillationMixin:
def distill_from(
self,
*,
dataset: str,
teacher_model: Union[str, _language_models._TextGenerationModel],
train_steps: Optional[int] = None,
learning_rate_multiplier: Optional[float] = None,
evaluation_spec: Optional[tuning.TuningEvaluationSpec] = None,
accelerator_type: Optional[tuning._ACCELERATOR_TYPE_TYPE] = None,
model_display_name: Optional[str] = None,
max_context_length: Optional[str] = None,
):
"""Tunes a smaller model with help from another bigger model.
Args:
dataset: A URI pointing to data in JSON lines format.
teacher_model: The teacher model to use for distillation.
train_steps: Number of training batches to use (batch size is 8 samples).
learning_rate_multiplier: Learning rate multiplier to use in tuning.
evaluation_spec: Specification for the model evaluation during tuning.
accelerator_type: Type of accelerator to use. Can be "TPU" or "GPU".
model_display_name: Custom display name for the tuned model.
max_context_length: The max context length used for tuning.
                Can be either '8k' or '32k'.
Returns:
A tuning job for distillation.
Raises:
RuntimeError: If the model does not support distillation.
"""
if "/models/" not in self._endpoint_name:
raise RuntimeError(
f"Model does not support distillation: {self._endpoint_name}"
)
student_short_model_id = self._endpoint_name.split("/")[-1]
if isinstance(teacher_model, str):
teacher_short_model_id = teacher_model
elif isinstance(teacher_model, _language_models._LanguageModel):
if "/models/" not in teacher_model._endpoint_name:
raise RuntimeError(
f"Teacher model does not support distillation: {teacher_model._endpoint_name}"
)
teacher_short_model_id = teacher_model._endpoint_name.split("/")[-1]
else:
raise RuntimeError(f"Unsupported teacher model type: {teacher_model}")
pipeline_job = submit_distillation_pipeline_job(
teacher_model=teacher_short_model_id,
student_model=student_short_model_id,
dataset=dataset,
train_steps=train_steps,
learning_rate_multiplier=learning_rate_multiplier,
evaluation_spec=evaluation_spec,
accelerator_type=accelerator_type,
model_display_name=model_display_name,
max_context_length=max_context_length,
)
tuning_job = tuning._LanguageModelTuningJob(
base_model=self,
job=pipeline_job,
)
return tuning_job
def submit_distillation_pipeline_job(
*,
teacher_model: str,
student_model: str,
dataset: str,
train_steps: Optional[int] = None,
learning_rate_multiplier: Optional[float] = None,
evaluation_spec: Optional[tuning.TuningEvaluationSpec] = None,
accelerator_type: Optional[tuning._ACCELERATOR_TYPE_TYPE] = None,
model_display_name: Optional[str] = None,
max_context_length: Optional[str] = None,
):
teacher_short_model_id = teacher_model.split("/")[-1]
student_short_model_id = student_model.split("/")[-1]
pipeline_arguments = {
"teacher_model_reference": teacher_model,
"student_model_reference": student_model,
"dataset_uri": dataset,
"project": aiplatform_initializer.global_config.project,
"location": aiplatform_initializer.global_config.location,
}
if train_steps is not None:
pipeline_arguments["train_steps"] = train_steps
if learning_rate_multiplier is not None:
pipeline_arguments["learning_rate_multiplier"] = learning_rate_multiplier
if evaluation_spec is not None:
pipeline_arguments["evaluation_data_uri"] = evaluation_spec.evaluation_data
pipeline_arguments["evaluation_interval"] = evaluation_spec.evaluation_interval
pipeline_arguments[
"enable_early_stopping"
] = evaluation_spec.enable_early_stopping
pipeline_arguments[
"enable_checkpoint_selection"
] = evaluation_spec.enable_checkpoint_selection
pipeline_arguments["tensorboard_resource_id"] = evaluation_spec.tensorboard
# pipeline_parameter_values["evaluation_output_root_dir"] = ...
if accelerator_type is not None:
pipeline_arguments["accelerator_type"] = accelerator_type
if aiplatform_initializer.global_config.encryption_spec_key_name is not None:
pipeline_arguments[
"encryption_spec_key_name"
] = aiplatform_initializer.global_config.encryption_spec_key_name
if max_context_length is not None:
pipeline_arguments["max_context_length"] = max_context_length
if model_display_name is None:
model_display_name = (
f"{student_short_model_id} distilled from {teacher_short_model_id}"
)
pipeline_arguments["model_display_name"] = model_display_name
# # Not exposing these parameters:
# temperature: Optional[float] = None,
# tpu_training_skip_cmek: Optional[bool] = None,
# api_endpoint: Optional[str] = None,
# version: Optional[str] = None,
pipeline_job = aiplatform.PipelineJob(
template_path=_DISTILLATION_PIPELINE_URI,
display_name=None,
parameter_values=pipeline_arguments,
)
pipeline_job.submit()
return pipeline_job
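
Illustrative sketch of how the mixin above might be invoked, assuming it is mixed into a tunable text model (as in the SDK's preview namespace); the teacher model ID, bucket paths, and tuning values are placeholder assumptions:

import vertexai
from vertexai.preview.language_models import TextGenerationModel, TuningEvaluationSpec

vertexai.init(project="my-project", location="us-central1")

# Student model that exposes distill_from() via DistillationMixin.
student = TextGenerationModel.from_pretrained("text-bison@001")

distillation_job = student.distill_from(
    teacher_model="text-unicorn@001",  # assumed teacher model ID
    dataset="gs://my-bucket/distillation/train.jsonl",
    train_steps=200,
    learning_rate_multiplier=1.0,
    evaluation_spec=TuningEvaluationSpec(
        evaluation_data="gs://my-bucket/distillation/eval.jsonl",
    ),
)
# The returned tuning job wraps the submitted distillation PipelineJob.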

View File

@@ -0,0 +1,754 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Classes for working with language models."""
import dataclasses
import json
import os
from typing import Any, Dict, List, Optional, Type, TypeVar, Union
from google.cloud import storage
from google.cloud import aiplatform
from google.cloud.aiplatform import base
from google.cloud.aiplatform import initializer as aiplatform_initializer
from google.cloud.aiplatform import utils as aiplatform_utils
from google.cloud.aiplatform.utils import gcs_utils
from vertexai._model_garden import _model_garden_models
from google.cloud.aiplatform.compat.services import (
model_garden_service_client,
)
from google.cloud.aiplatform.compat.types import (
pipeline_state as gca_pipeline_state,
)
try:
import pandas
except ImportError:
pandas = None
_LOGGER = base.Logger(__name__)
# Model Evaluation constants
_TEXT_CLASSIFICATION_TASK_NAME = "text-classification"
_TEXT_GENERATION_TASK_NAME = "text-generation"
_QA_TASK_NAME = "question-answering"
_SUMMARIZATION_TASK_NAME = "summarization"
_EVALUATION_TASKS = frozenset(
[
_TEXT_CLASSIFICATION_TASK_NAME,
_TEXT_GENERATION_TASK_NAME,
_QA_TASK_NAME,
_SUMMARIZATION_TASK_NAME,
]
)
_TEXT_CLASSIFICATION_TEMPLATE_URL = "https://us-kfp.pkg.dev/vertex-evaluation/pipeline-templates/evaluation-llm-classification-pipeline"
_TEXT_GENERATION_QA_SUMMARIZATION_TEMPLATE_URL = "https://us-kfp.pkg.dev/vertex-evaluation/pipeline-templates/evaluation-llm-text-generation-pipeline"
_EVALUATION_TEMPLATE_VERSION_TAG = "2.9.0"
_EVALUATION_TEMPLATE_URLS = {
_TEXT_CLASSIFICATION_TASK_NAME: f"{_TEXT_CLASSIFICATION_TEMPLATE_URL}/{_EVALUATION_TEMPLATE_VERSION_TAG}",
_TEXT_GENERATION_TASK_NAME: f"{_TEXT_GENERATION_QA_SUMMARIZATION_TEMPLATE_URL}/{_EVALUATION_TEMPLATE_VERSION_TAG}",
_QA_TASK_NAME: f"{_TEXT_GENERATION_QA_SUMMARIZATION_TEMPLATE_URL}/{_EVALUATION_TEMPLATE_VERSION_TAG}",
_SUMMARIZATION_TASK_NAME: f"{_TEXT_GENERATION_QA_SUMMARIZATION_TEMPLATE_URL}/{_EVALUATION_TEMPLATE_VERSION_TAG}",
}
_EVALUATION_PIPELINE_COMPONENT_IDENTIFIER = "fpc-llm-evaluation"
_BATCH_PREDICTION_ROW_LIMIT = 30000
_EVAL_SUPPORTED_BASE_MODELS = ["text-bison@001", "text-bison@002"]
T = TypeVar("T", bound="_EvaluationMetricBase")
def _check_dataset_is_within_size_limit(
data: "pandas.DataFrame",
) -> None:
if len(data) < _BATCH_PREDICTION_ROW_LIMIT:
return
raise ValueError(
f"Your evaluation dataset size exceeds the limit of {_BATCH_PREDICTION_ROW_LIMIT}"
)
def _get_model_resource_name_and_validate(
model_name: str,
model_info: _model_garden_models._ModelInfo,
) -> str:
"""Returns the resource name string for the model.
Model Registry resource names will stay the same. For Publisher Models, we need to
pass the full resource name (publishers/google/models/text-bison@001) to the evaluation
template and ensure the base model supports evaluation.
Args:
model_name (str):
Required. The full resource name of the Model Registry model or base publisher model
to run evaluation on.
model_info (_model_garden_models._ModelInfo):
Required. The _ModelInfo object for the instance.
Returns:
The formatted model_name string.
Raises:
ValueError
If a base PublisherModel was provided and the model doesn't support evaluation.
"""
if "publishers/" not in model_name:
# Model Registry resource
return model_name
else:
if model_info.tuning_model_id in _EVAL_SUPPORTED_BASE_MODELS:
return f"{model_info.publisher_model_resource.name}@{model_info.publisher_model_resource.version_id}"
raise ValueError(
f"The provided model {model_name} does not support evaluation."
)
def _get_template_url(task_name: str) -> Optional[str]:
"""Returns the pipeline template to use for the evaluation task.
Args:
task_name (str):
Required. The name of the evaluation task to run.
Returns:
The evaluation pipeline template path.
"""
return _EVALUATION_TEMPLATE_URLS.get(task_name)
@dataclasses.dataclass
class _EvaluationTaskSpec:
"""Base class for task-specific model evaluation configuration parameters.
    This class should not be instantiated directly; instead, use the subclass corresponding
    to your evaluation task.
Args:
ground_truth_data (Union[List[str], str, pandas.DataFrame]):
Required. The ground truth data to use for this evaluation job. This can be
either a Pandas DataFrame, a Cloud Storage URI of your JSONL data file, or a list of multiple
JSONL files on Cloud Storage.
Raises:
ValueError:
If task_spec.ground_truth_data is formatted incorrectly.
If task_spec.ground_truth_data is a Pandas DataFrame and exceeds 1000 rows.
If task_spec.ground_truth_data is not a string, list, or Pandas DataFrame.
"""
ground_truth_data: Union[List[str], str, "pandas.DataFrame"]
@property
def task_name(self) -> str:
pass
def __post_init__(self):
if isinstance(self.ground_truth_data, str):
self.ground_truth_data = [self.ground_truth_data]
if isinstance(self.ground_truth_data, list) and not all(
item.startswith("gs://") for item in self.ground_truth_data
):
raise ValueError("Please provide a valid GCS URI starting with 'gs://'")
if pandas and isinstance(self.ground_truth_data, pandas.DataFrame):
_check_dataset_is_within_size_limit(self.ground_truth_data)
@dataclasses.dataclass
class EvaluationTextClassificationSpec(_EvaluationTaskSpec):
"""Spec for text classification model evaluation tasks.
Args:
target_column_name (str):
Required. The label column in the dataset provided in `ground_truth_data`. Required when task_name='text-classification'.
class_names (List[str]):
Required. A list of all possible label names in your dataset. Required when task_name='text-classification'.
"""
target_column_name: str
class_names: List[str]
@property
def task_name(self) -> str:
return "text-classification"
@dataclasses.dataclass
class EvaluationTextGenerationSpec(_EvaluationTaskSpec):
"""Spec for text generation model evaluation tasks."""
@property
def task_name(self) -> str:
return "text-generation"
@dataclasses.dataclass
class EvaluationQuestionAnsweringSpec(_EvaluationTaskSpec):
"""Spec for question answering model evaluation tasks."""
task_name: str = "question-answering"
@dataclasses.dataclass
class EvaluationTextSummarizationSpec(_EvaluationTaskSpec):
"""Spec for text summarization model evaluation tasks."""
task_name: str = "summarization"
@dataclasses.dataclass
class _EvaluationMetricBase:
"""Base class for returned evaulation metrics."""
@property
def input_dataset_paths(self) -> str:
"""The Google Cloud Storage paths to the dataset used for this evaluation."""
pass
@property
def task_name(self) -> str:
"""The type of evaluation task for the evaluation.."""
pass
@dataclasses.dataclass
class EvaluationMetric(_EvaluationMetricBase):
"""The evaluation metric response.
Args:
bleu (float):
            Optional. BLEU (Bilingual Evaluation Understudy). Scores based on the sacrebleu implementation.
rougeLSum (float):
Optional. ROUGE-L (Longest Common Subsequence) scoring at summary level.
"""
bleu: Optional[float] = None
rougeLSum: Optional[float] = None
@dataclasses.dataclass
class EvaluationClassificationMetric(_EvaluationMetricBase):
"""The evaluation metric response for classification metrics.
Args:
label_name (str):
Optional. The name of the label associated with the metrics. This is only
returned when `only_summary_metrics=False` is passed to evaluate().
auPrc (float):
Optional. The area under the precision recall curve.
auRoc (float):
Optional. The area under the receiver operating characteristic curve.
logLoss (float):
Optional. Logarithmic loss.
confidenceMetrics (List[Dict[str, Any]]):
Optional. This is only returned when `only_summary_metrics=False` is
passed to evaluate().
confusionMatrix (Dict[str, Any]):
Optional. This is only returned when `only_summary_metrics=False` is
passed to evaluate().
"""
label_name: Optional[str] = None
auPrc: Optional[float] = None
auRoc: Optional[float] = None
logLoss: Optional[float] = None
confidenceMetrics: Optional[List[Dict[str, Any]]] = None
confusionMatrix: Optional[Dict[str, Any]] = None
@dataclasses.dataclass
class EvaluationSlicedClassificationMetric(_EvaluationMetricBase):
"""The evaluation metric slices returned for classification metrics.
This is returned when `only_summary_metrics=False` is passed to evaluate().
Args:
overall_metrics (EvaluationClassificationMetric):
The evaluation metrics across all slices of data
slices (List[EvaluationClassificationMetric]):
The evaluation metrics for each label slice.
"""
overall_metrics: Optional[EvaluationClassificationMetric] = None
slices: Optional[List[EvaluationClassificationMetric]] = None
def _populate_eval_template_params(
task_spec: _EvaluationTaskSpec,
model_name: str,
service_account: Optional[str] = None,
machine_type: Optional[str] = None,
network: Optional[str] = None,
encryption_spec_key_name: Optional[str] = None,
) -> Dict[str, Any]:
"""Populates a dictionary of template parameters for the evaluation PipelineJob.
Args:
task_spec (EvaluationTaskSpec):
The EvaluationTaskSpec passed to evaluate() for this job
model_name (str):
The resource name of the model being evaluated. Either a PublisherModel or
ModelRegistry resource name.
service_account (Optional[str]):
The default service account for workload run-as account.
machine_type (Optional[str]):
Optional. The type of the machine to run the evaluation job on.
network (Optional[str]):
Optional.
encryption_spec_key_name (Optional[str]):
Optional.
Returns:
Dict[str, Any]:
A dictionary of template parameter names and values to be passed to the PipelineJob
running the model evaluation.
"""
ground_truth_data_gcs_path = task_spec.ground_truth_data
staging_bucket = aiplatform_initializer.global_config.staging_bucket
if not staging_bucket:
staging_bucket = (
gcs_utils.create_gcs_bucket_for_pipeline_artifacts_if_it_does_not_exist()
)
timestamped_eval_directory = (
f"evaluation_data_{aiplatform_utils.timestamped_unique_name()}"
)
if isinstance(task_spec.ground_truth_data, pandas.DataFrame):
# Convert to jsonl file and upload to gcs
dataset_uri = os.path.join(
staging_bucket,
timestamped_eval_directory,
"eval_data.jsonl",
)
gcs_utils._upload_pandas_df_to_gcs(
df=task_spec.ground_truth_data, upload_gcs_path=dataset_uri
)
ground_truth_data_gcs_path = [dataset_uri]
template_params = {
"project": aiplatform_initializer.global_config.project,
"location": aiplatform_initializer.global_config.location,
"batch_predict_gcs_destination_output_uri": f"{staging_bucket}/{timestamped_eval_directory}",
"model_name": model_name,
"batch_predict_gcs_source_uris": ground_truth_data_gcs_path,
"service_account": service_account,
"machine_type": machine_type,
"encrytion_spec_key_name": encryption_spec_key_name
or aiplatform_initializer.global_config.encryption_spec_key_name,
"network": network or aiplatform_initializer.global_config.network,
}
if task_spec.task_name == _TEXT_CLASSIFICATION_TASK_NAME:
template_params["evaluation_class_labels"] = task_spec.class_names
template_params["target_field_name"] = task_spec.target_column_name
else:
template_params["evaluation_task"] = task_spec.task_name
return template_params
# TODO (b/285947054): update to use public pipeline contract
def _get_gcs_uri_from_pipeline_task_details(
pipeline_job: aiplatform.PipelineJob,
) -> Optional[str]:
"""Gets the GCS URI from the PipelineJob output.
Args:
pipeline_job (aiplatform.PipelineJob)
The PipelineJob resource to get the metrics GCS URI from
Returns:
The GCS URI of the evaluation metrics as a string.
"""
for task in pipeline_job.task_details:
if task.task_name == pipeline_job.name and "evaluation_metrics" in task.outputs:
return task.outputs["evaluation_metrics"].artifacts[0].uri
def _convert_metrics_dict_to_response_type(
metrics_json: Dict[str, Any],
metric_type: Type[T],
metric_name: Optional[str] = None,
) -> EvaluationClassificationMetric:
metrics_response = metric_type()
if metric_name:
metrics_response.label_name = metric_name
for metric, value in metrics_json.items():
if hasattr(metrics_response, metric):
setattr(metrics_response, metric, value)
return metrics_response
def _format_classification_metrics(
metrics: Dict[str, Any]
) -> EvaluationSlicedClassificationMetric:
"""Reformats classification metrics returned by the eval pipeline to make them more readable.
    Returned metrics are of type EvaluationSlicedClassificationMetric, with `overall_metrics` representing
    the metrics for all data, and `slices` representing the metrics for each label in the dataset.
Example schema of reformatted metrics:
    EvaluationSlicedClassificationMetric(
overall_metrics=EvaluationClassificationMetric(
auPrc=...
)
slices=[
EvaluationClassificationMetric(
label_name="overall",
auPrc=...,
...
),
EvaluationClassificationMetric(
label_name="label_1",
auPrc=...,
...
),
EvaluationClassificationMetric(
label_name="label_2",
auPrc=...,
...
)
]
)
"""
reformatted_metrics = EvaluationSlicedClassificationMetric()
# TODO: see if we can do this without relying on specific keys, i.e. slicedMetrics
# First add overall metrics
overall_metrics = _convert_metrics_dict_to_response_type(
metrics_json=metrics["slicedMetrics"][0]["metrics"]["classification"],
metric_type=EvaluationClassificationMetric,
)
reformatted_metrics.overall_metrics = overall_metrics
sliced_metrics = []
# Then add metrics for each slice
for idx in range(1, len(metrics["slicedMetrics"])):
metric_slice_name = metrics["slicedMetrics"][idx]["singleOutputSlicingSpec"][
"value"
]
sliced_metric = _convert_metrics_dict_to_response_type(
metrics_json=metrics["slicedMetrics"][idx]["metrics"]["classification"],
metric_type=EvaluationClassificationMetric,
metric_name=metric_slice_name,
)
sliced_metrics.append(sliced_metric)
    reformatted_metrics.slices = sliced_metrics
return reformatted_metrics
def _get_metrics_from_gcs_uri(
gcs_uri: str,
) -> Union[
EvaluationMetric,
EvaluationClassificationMetric,
EvaluationSlicedClassificationMetric,
]:
"""Downloads evaluation metrics from GCS path."""
storage_client = storage.Client(
credentials=aiplatform_initializer.global_config.credentials
)
    metrics_json = json.loads(
        storage.Blob.from_string(
            uri=gcs_uri, client=storage_client
        ).download_as_text()
    )
# Sliced classification metrics case, format data
if "slicedMetrics" in metrics_json:
return _format_classification_metrics(metrics_json)
# If classification metrics don't contain slices, use EvaluationClassificationMetric type
if "auPrc" in metrics_json:
metrics_response = _convert_metrics_dict_to_response_type(
metrics_json=metrics_json,
metric_type=EvaluationClassificationMetric,
)
# All other metric types
else:
metrics_response = _convert_metrics_dict_to_response_type(
metrics_json=metrics_json,
metric_type=EvaluationMetric,
)
return metrics_response
def _get_metrics_from_pipeline_task_details(
pipeline_job: aiplatform.PipelineJob,
) -> Union[EvaluationMetric, EvaluationClassificationMetric]:
"""Gets the evaluation metrics from the PipelineJob TaskDetails.
Args:
pipeline_job (aiplatform.PipelineJob)
The PipelineJob resource to get the metrics from
Returns:
A dictionary with the evaluation metrics
"""
metrics = {}
# TODO (b/292076101): this now uses a public pipelines contract, but still relies on task_details
for task in pipeline_job.task_details:
if task.task_name == pipeline_job.name:
for output in task.outputs:
for metric_name, metric_value in (
task.outputs[output].artifacts[0].metadata.items()
):
metrics[metric_name] = metric_value
if "auPrc" in metrics:
metrics_response = EvaluationClassificationMetric()
else:
metrics_response = EvaluationMetric()
for metric, value in metrics.items():
if hasattr(metrics_response, metric):
setattr(metrics_response, metric, value)
return metrics_response
class _LanguageModelEvaluationJob:
"""Represents a model evaluation job for LLM models.
These evaluation jobs are run as a Vertex Pipeline.
"""
def __init__(
self,
pipeline_job: aiplatform.PipelineJob,
):
self._pipeline_job = pipeline_job
def result(
self, *, only_summary_metrics: bool
) -> Union[EvaluationMetric, EvaluationClassificationMetric]:
"""Blocks on completion of the model evaluation PipelineJob and returns metrics."""
self._pipeline_job.wait()
if only_summary_metrics:
return _get_metrics_from_pipeline_task_details(self._pipeline_job)
else:
gcs_uri = _get_gcs_uri_from_pipeline_task_details(self._pipeline_job)
if gcs_uri:
return _get_metrics_from_gcs_uri(gcs_uri)
class _EvaluatableLanguageModel:
"""Mixin class for LLMs that support model evaluation."""
# TODO (b/282975912): convert training job specific args to a TrainingConfig
def evaluate(
self,
*,
task_spec: _EvaluationTaskSpec,
only_summary_metrics: Optional[bool] = True,
machine_type: Optional[str] = None,
) -> Union[
EvaluationMetric,
EvaluationClassificationMetric,
EvaluationSlicedClassificationMetric,
]:
"""Runs model evaluation using the provided input and ground truth data.
        This creates an evaluation job and blocks until the job completes, which typically takes 10-20 minutes.
Example:
```
model = TextGenerationModel.from_pretrained("text-bison@001")
eval_metrics = model.evaluate(
task_spec=EvaluationTextGenerationSpec(
ground_truth_data="gs://my-bucket/ground-truth.jsonl",
)
)
```
Args:
task_spec (_EvaluationTaskSpec):
Required. The configuration spec for your model evaluation job. Choose the spec corresponding
with the evaluation task you are performing, one of: EvaluationClassificationSpec, EvaluationTextGenerationSpec,
EvaluationTextSummarizationSpec, EvaluationQuestionAnsweringSpec.
For example, a valid classification `task_spec` is:
EvaluationTextClassificationSpec(
ground_truth_data=["gs://bucket/path/to/your/data.jsonl"],
class_names=["cheddar", "gouda", "camembert"],
target_column_name="cheese_type",
)
only_summary_metrics (bool):
Optional. Setting this field to False only affects the metrics returned for text classification tasks.
When False, text classification metrics will include additional sliced metrics fields, with metrics for
each label slice in the data.
machine_type (str):
Optional. The type of the machine to run the evaluation job on. The default value is "e2-highmem-16". For
tasks with a large evaluation dataset, a bigger machine type may be required.
For more details about this input config, see
https://cloud.google.com/vertex-ai/docs/training/configure-compute#machine-types.
Returns:
            Union[EvaluationMetric, EvaluationClassificationMetric, EvaluationSlicedClassificationMetric]
                The evaluation metrics from this evaluation job. When `only_summary_metrics=False` is passed
                and the evaluation task type is 'text-classification', the return type is EvaluationSlicedClassificationMetric,
                whose `slices` field holds the metrics associated with each classification label.
"""
model_info = _model_garden_models._get_model_info(
self._model_id,
schema_to_class_map={self._INSTANCE_SCHEMA_URI: type(self)},
)
model_name = _get_model_resource_name_and_validate(
model_name=self._model_resource_name, model_info=model_info
)
# TODO(b/296402511): get service_account from aiplatform_initializer and pass it to the template here and to PipelineJob after cl/539823838 is submitted
template_params = _populate_eval_template_params(
task_spec=task_spec,
model_name=model_name,
machine_type=machine_type,
network=aiplatform_initializer.global_config.network,
encryption_spec_key_name=aiplatform_initializer.global_config.encryption_spec_key_name,
)
template_path = _get_template_url(task_spec.task_name)
pipeline_job = aiplatform.PipelineJob(
template_path=template_path,
parameter_values=template_params,
display_name=f"llm-eval-sdk-{aiplatform_utils.timestamped_unique_name()}",
)
pipeline_job.submit()
eval_job = _LanguageModelEvaluationJob(pipeline_job=pipeline_job)
_LOGGER.info(
"Your evaluation job is running and will take 15-20 minutes to complete. Click on the PipelineJob link to view progress."
)
# NOTE: only_summary_metrics is passed because getting metrics from the artifact is faster than downloading from GCS
# GCS is only needed for additional metrics for text-classification tasks
return eval_job.result(only_summary_metrics=only_summary_metrics)
def list_evaluation_metrics(
self,
*,
task_name: Optional[str] = None,
only_summary_metrics: Optional[bool] = True,
) -> List[Union[EvaluationMetric, EvaluationClassificationMetric]]:
"""Lists the evaluation metrics from all evaluation jobs run on this model.
Args:
task_name (str):
Optional. The task name to return evaluation metrics for. If provided, this will only return evaluation
                metrics for tasks of the provided type. This matches the possible values of the evaluation task spec's task_name,
and must be one of 'text-generation', 'text-classification', 'summarization', or 'question-answering'.
Returns:
            List[Union[EvaluationMetric, EvaluationClassificationMetric]]
The evaluation metrics from all evaluation jobs run on this model.
"""
model_name = self._model_resource_name
publisher_model_parts = model_garden_service_client.ModelGardenServiceClient.parse_publisher_model_path(
"".join(model_name.rpartition("publishers")[1:])
)
if publisher_model_parts:
model_id = publisher_model_parts["model"]
model_name = f"publishers/google/models/{model_id}"
filters = f'metadata.component_type.string_value={_EVALUATION_PIPELINE_COMPONENT_IDENTIFIER} AND metadata."input:model_name".string_value={model_name} AND (metadata."input:evaluation_task".string_value={_TEXT_GENERATION_TASK_NAME} OR metadata."input:evaluation_task".string_value={_SUMMARIZATION_TASK_NAME} OR metadata."input:evaluation_task".string_value={_QA_TASK_NAME} OR metadata."input:evaluation_task".string_value={_TEXT_CLASSIFICATION_TASK_NAME})'
# NOTE: when task_name is appended to the filter the block of OR filters in `filters` above becomes a no-op
if task_name:
filters += f' AND metadata."input:evaluation_task".string_value={task_name}'
filtered_pipeline_executions = aiplatform.Execution.list(
filter=filters,
project=aiplatform_initializer.global_config.project,
location=aiplatform_initializer.global_config.location,
credentials=aiplatform_initializer.global_config.credentials,
)
model_eval_metrics = []
# TODO (b/285950380): improve performance of this method
for pipeline_execution in filtered_pipeline_executions:
if "pipeline_job_resource_name" not in pipeline_execution.metadata:
continue
pipeline_job_resource = aiplatform.PipelineJob.get(
resource_name=pipeline_execution.metadata["pipeline_job_resource_name"]
)
eval_job_state = pipeline_job_resource._gca_resource.state
if (
eval_job_state
!= gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED
):
continue
metrics = None
if only_summary_metrics:
metrics = _get_metrics_from_pipeline_task_details(pipeline_job_resource)
else:
gcs_uri = _get_gcs_uri_from_pipeline_task_details(pipeline_job_resource)
if gcs_uri:
metrics = _get_metrics_from_gcs_uri(gcs_uri)
metrics.input_dataset_paths = pipeline_execution.metadata[
"input:batch_predict_gcs_source_uris"
]
metrics.task_name = pipeline_execution.metadata["input:evaluation_task"]
model_eval_metrics.append(metrics)
return model_eval_metrics
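
Illustrative sketch of the evaluation mixin above, assuming it is mixed into a supported text model (e.g. text-bison@001); the project, bucket paths, label names, and target column are placeholder assumptions:

import vertexai
from vertexai.preview.language_models import (
    EvaluationTextClassificationSpec,
    TextGenerationModel,
)

vertexai.init(project="my-project", location="us-central1")

model = TextGenerationModel.from_pretrained("text-bison@001")

# Blocks until the evaluation PipelineJob finishes (roughly 10-20 minutes).
metrics = model.evaluate(
    task_spec=EvaluationTextClassificationSpec(
        ground_truth_data=["gs://my-bucket/eval/cheese.jsonl"],
        class_names=["cheddar", "gouda", "camembert"],
        target_column_name="cheese_type",
    ),
    only_summary_metrics=False,  # also return per-label sliced metrics
)
print(metrics.overall_metrics.auPrc)
for label_slice in metrics.slices or []:
    print(label_slice.label_name, label_slice.auPrc)

# Metrics from previous evaluation runs on this model can be listed as well.
for past in model.list_evaluation_metrics(task_name="text-classification"):
    print(past.task_name, past.input_dataset_paths)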

File diff suppressed because it is too large.