structure saas with tools

This commit is contained in:
Davidson Gomes
2025-04-25 15:30:54 -03:00
commit 1aef473937
16434 changed files with 6584257 additions and 0 deletions

View File

@@ -0,0 +1,47 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Classes for working with language models."""
from vertexai.language_models._language_models import (
ChatMessage,
ChatModel,
ChatSession,
CodeChatModel,
CodeChatSession,
CodeGenerationModel,
InputOutputTextPair,
TextEmbedding,
TextEmbeddingInput,
TextEmbeddingModel,
TextGenerationModel,
TextGenerationResponse,
GroundingSource,
)
__all__ = [
"ChatMessage",
"ChatModel",
"ChatSession",
"CodeChatModel",
"CodeChatSession",
"CodeGenerationModel",
"InputOutputTextPair",
"TextEmbedding",
"TextEmbeddingInput",
"TextEmbeddingModel",
"TextGenerationModel",
"TextGenerationResponse",
"GroundingSource",
]
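
For orientation, a minimal usage sketch of the classes re-exported above (illustrative only, not part of this commit; the project ID, location, model versions, and prompts are placeholder assumptions):

import vertexai
from vertexai.language_models import (
    ChatModel,
    InputOutputTextPair,
    TextGenerationModel,
)

# Assumed placeholders: replace with a real project and location.
vertexai.init(project="my-project", location="us-central1")

# Text generation: predict() returns a TextGenerationResponse.
generation_model = TextGenerationModel.from_pretrained("text-bison@001")
print(generation_model.predict("Summarize model distillation in one sentence.").text)

# Chat: start_chat() returns a ChatSession seeded with example pairs.
chat_model = ChatModel.from_pretrained("chat-bison@001")
chat = chat_model.start_chat(
    examples=[InputOutputTextPair(input_text="Hi", output_text="Hello! How can I help?")]
)
print(chat.send_message("Which classes does this module expose?").text)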

View File

@@ -0,0 +1,141 @@
from typing import Optional, Union
from google.cloud import aiplatform
from google.cloud.aiplatform import initializer as aiplatform_initializer
from vertexai.language_models import _language_models
from vertexai.language_models import _language_models as tuning
_DISTILLATION_PIPELINE_URI = (
"https://us-kfp.pkg.dev/ml-pipeline/distillation/distillation/v1.0.0"
)
class DistillationMixin:
def distill_from(
self,
*,
dataset: str,
teacher_model: Union[str, _language_models._TextGenerationModel],
train_steps: Optional[int] = None,
learning_rate_multiplier: Optional[float] = None,
evaluation_spec: Optional[tuning.TuningEvaluationSpec] = None,
accelerator_type: Optional[tuning._ACCELERATOR_TYPE_TYPE] = None,
model_display_name: Optional[str] = None,
max_context_length: Optional[str] = None,
):
"""Tunes a smaller model with help from another bigger model.
Args:
dataset: A URI pointing to data in JSON lines format.
teacher_model: The teacher model to use for distillation.
train_steps: Number of training batches to use (batch size is 8 samples).
learning_rate_multiplier: Learning rate multiplier to use in tuning.
evaluation_spec: Specification for the model evaluation during tuning.
accelerator_type: Type of accelerator to use. Can be "TPU" or "GPU".
model_display_name: Custom display name for the tuned model.
max_context_length: The max context length used for tuning.
                Can be either '8k' or '32k'.
Returns:
A tuning job for distillation.
Raises:
RuntimeError: If the model does not support distillation.
"""
if "/models/" not in self._endpoint_name:
raise RuntimeError(
f"Model does not support distillation: {self._endpoint_name}"
)
student_short_model_id = self._endpoint_name.split("/")[-1]
if isinstance(teacher_model, str):
teacher_short_model_id = teacher_model
elif isinstance(teacher_model, _language_models._LanguageModel):
if "/models/" not in teacher_model._endpoint_name:
raise RuntimeError(
f"Teacher model does not support distillation: {teacher_model._endpoint_name}"
)
teacher_short_model_id = teacher_model._endpoint_name.split("/")[-1]
else:
raise RuntimeError(f"Unsupported teacher model type: {teacher_model}")
pipeline_job = submit_distillation_pipeline_job(
teacher_model=teacher_short_model_id,
student_model=student_short_model_id,
dataset=dataset,
train_steps=train_steps,
learning_rate_multiplier=learning_rate_multiplier,
evaluation_spec=evaluation_spec,
accelerator_type=accelerator_type,
model_display_name=model_display_name,
max_context_length=max_context_length,
)
tuning_job = tuning._LanguageModelTuningJob(
base_model=self,
job=pipeline_job,
)
return tuning_job
def submit_distillation_pipeline_job(
*,
teacher_model: str,
student_model: str,
dataset: str,
train_steps: Optional[int] = None,
learning_rate_multiplier: Optional[float] = None,
evaluation_spec: Optional[tuning.TuningEvaluationSpec] = None,
accelerator_type: Optional[tuning._ACCELERATOR_TYPE_TYPE] = None,
model_display_name: Optional[str] = None,
max_context_length: Optional[str] = None,
):
teacher_short_model_id = teacher_model.split("/")[-1]
student_short_model_id = student_model.split("/")[-1]
pipeline_arguments = {
"teacher_model_reference": teacher_model,
"student_model_reference": student_model,
"dataset_uri": dataset,
"project": aiplatform_initializer.global_config.project,
"location": aiplatform_initializer.global_config.location,
}
if train_steps is not None:
pipeline_arguments["train_steps"] = train_steps
if learning_rate_multiplier is not None:
pipeline_arguments["learning_rate_multiplier"] = learning_rate_multiplier
if evaluation_spec is not None:
pipeline_arguments["evaluation_data_uri"] = evaluation_spec.evaluation_data
pipeline_arguments["evaluation_interval"] = evaluation_spec.evaluation_interval
pipeline_arguments[
"enable_early_stopping"
] = evaluation_spec.enable_early_stopping
pipeline_arguments[
"enable_checkpoint_selection"
] = evaluation_spec.enable_checkpoint_selection
pipeline_arguments["tensorboard_resource_id"] = evaluation_spec.tensorboard
# pipeline_parameter_values["evaluation_output_root_dir"] = ...
if accelerator_type is not None:
pipeline_arguments["accelerator_type"] = accelerator_type
if aiplatform_initializer.global_config.encryption_spec_key_name is not None:
pipeline_arguments[
"encryption_spec_key_name"
] = aiplatform_initializer.global_config.encryption_spec_key_name
if max_context_length is not None:
pipeline_arguments["max_context_length"] = max_context_length
if model_display_name is None:
model_display_name = (
f"{student_short_model_id} distilled from {teacher_short_model_id}"
)
pipeline_arguments["model_display_name"] = model_display_name
# # Not exposing these parameters:
# temperature: Optional[float] = None,
# tpu_training_skip_cmek: Optional[bool] = None,
# api_endpoint: Optional[str] = None,
# version: Optional[str] = None,
pipeline_job = aiplatform.PipelineJob(
template_path=_DISTILLATION_PIPELINE_URI,
display_name=None,
parameter_values=pipeline_arguments,
)
pipeline_job.submit()
return pipeline_job
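
Illustrative sketch of how the mixin above might be invoked, assuming it is mixed into a tunable text model (as in the SDK's preview namespace); the teacher model ID, bucket paths, and tuning values are placeholder assumptions:

import vertexai
from vertexai.preview.language_models import TextGenerationModel, TuningEvaluationSpec

vertexai.init(project="my-project", location="us-central1")

# Student model that exposes distill_from() via DistillationMixin.
student = TextGenerationModel.from_pretrained("text-bison@001")

distillation_job = student.distill_from(
    teacher_model="text-unicorn@001",  # assumed teacher model ID
    dataset="gs://my-bucket/distillation/train.jsonl",
    train_steps=200,
    learning_rate_multiplier=1.0,
    evaluation_spec=TuningEvaluationSpec(
        evaluation_data="gs://my-bucket/distillation/eval.jsonl",
    ),
)
# The returned tuning job wraps the submitted distillation PipelineJob.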

View File

@@ -0,0 +1,754 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Classes for working with language models."""
import dataclasses
import json
import os
from typing import Any, Dict, List, Optional, Type, TypeVar, Union
from google.cloud import storage
from google.cloud import aiplatform
from google.cloud.aiplatform import base
from google.cloud.aiplatform import initializer as aiplatform_initializer
from google.cloud.aiplatform import utils as aiplatform_utils
from google.cloud.aiplatform.utils import gcs_utils
from vertexai._model_garden import _model_garden_models
from google.cloud.aiplatform.compat.services import (
model_garden_service_client,
)
from google.cloud.aiplatform.compat.types import (
pipeline_state as gca_pipeline_state,
)
try:
import pandas
except ImportError:
pandas = None
_LOGGER = base.Logger(__name__)
# Model Evaluation constants
_TEXT_CLASSIFICATION_TASK_NAME = "text-classification"
_TEXT_GENERATION_TASK_NAME = "text-generation"
_QA_TASK_NAME = "question-answering"
_SUMMARIZATION_TASK_NAME = "summarization"
_EVALUATION_TASKS = frozenset(
[
_TEXT_CLASSIFICATION_TASK_NAME,
_TEXT_GENERATION_TASK_NAME,
_QA_TASK_NAME,
_SUMMARIZATION_TASK_NAME,
]
)
_TEXT_CLASSIFICATION_TEMPLATE_URL = "https://us-kfp.pkg.dev/vertex-evaluation/pipeline-templates/evaluation-llm-classification-pipeline"
_TEXT_GENERATION_QA_SUMMARIZATION_TEMPLATE_URL = "https://us-kfp.pkg.dev/vertex-evaluation/pipeline-templates/evaluation-llm-text-generation-pipeline"
_EVALUATION_TEMPLATE_VERSION_TAG = "2.9.0"
_EVALUATION_TEMPLATE_URLS = {
_TEXT_CLASSIFICATION_TASK_NAME: f"{_TEXT_CLASSIFICATION_TEMPLATE_URL}/{_EVALUATION_TEMPLATE_VERSION_TAG}",
_TEXT_GENERATION_TASK_NAME: f"{_TEXT_GENERATION_QA_SUMMARIZATION_TEMPLATE_URL}/{_EVALUATION_TEMPLATE_VERSION_TAG}",
_QA_TASK_NAME: f"{_TEXT_GENERATION_QA_SUMMARIZATION_TEMPLATE_URL}/{_EVALUATION_TEMPLATE_VERSION_TAG}",
_SUMMARIZATION_TASK_NAME: f"{_TEXT_GENERATION_QA_SUMMARIZATION_TEMPLATE_URL}/{_EVALUATION_TEMPLATE_VERSION_TAG}",
}
_EVALUATION_PIPELINE_COMPONENT_IDENTIFIER = "fpc-llm-evaluation"
_BATCH_PREDICTION_ROW_LIMIT = 30000
_EVAL_SUPPORTED_BASE_MODELS = ["text-bison@001", "text-bison@002"]
T = TypeVar("T", bound="_EvaluationMetricBase")
def _check_dataset_is_within_size_limit(
data: "pandas.DataFrame",
) -> None:
if len(data) < _BATCH_PREDICTION_ROW_LIMIT:
return
raise ValueError(
f"Your evaluation dataset size exceeds the limit of {_BATCH_PREDICTION_ROW_LIMIT}"
)
def _get_model_resource_name_and_validate(
model_name: str,
model_info: _model_garden_models._ModelInfo,
) -> str:
"""Returns the resource name string for the model.
Model Registry resource names will stay the same. For Publisher Models, we need to
pass the full resource name (publishers/google/models/text-bison@001) to the evaluation
template and ensure the base model supports evaluation.
Args:
model_name (str):
Required. The full resource name of the Model Registry model or base publisher model
to run evaluation on.
model_info (_model_garden_models._ModelInfo):
Required. The _ModelInfo object for the instance.
Returns:
The formatted model_name string.
Raises:
ValueError
If a base PublisherModel was provided and the model doesn't support evaluation.
"""
if "publishers/" not in model_name:
# Model Registry resource
return model_name
else:
if model_info.tuning_model_id in _EVAL_SUPPORTED_BASE_MODELS:
return f"{model_info.publisher_model_resource.name}@{model_info.publisher_model_resource.version_id}"
raise ValueError(
f"The provided model {model_name} does not support evaluation."
)
def _get_template_url(task_name: str) -> Optional[str]:
"""Returns the pipeline template to use for the evaluation task.
Args:
task_name (str):
Required. The name of the evaluation task to run.
Returns:
The evaluation pipeline template path.
"""
return _EVALUATION_TEMPLATE_URLS.get(task_name)
@dataclasses.dataclass
class _EvaluationTaskSpec:
"""Base class for task-specific model evaluation configuration parameters.
    This class should not be instantiated directly; instead, use the subclass corresponding
    to your evaluation task.
Args:
ground_truth_data (Union[List[str], str, pandas.DataFrame]):
Required. The ground truth data to use for this evaluation job. This can be
either a Pandas DataFrame, a Cloud Storage URI of your JSONL data file, or a list of multiple
JSONL files on Cloud Storage.
Raises:
ValueError:
If task_spec.ground_truth_data is formatted incorrectly.
If task_spec.ground_truth_data is a Pandas DataFrame and exceeds 1000 rows.
If task_spec.ground_truth_data is not a string, list, or Pandas DataFrame.
"""
ground_truth_data: Union[List[str], str, "pandas.DataFrame"]
@property
def task_name(self) -> str:
pass
def __post_init__(self):
if isinstance(self.ground_truth_data, str):
self.ground_truth_data = [self.ground_truth_data]
if isinstance(self.ground_truth_data, list) and not all(
item.startswith("gs://") for item in self.ground_truth_data
):
raise ValueError("Please provide a valid GCS URI starting with 'gs://'")
if pandas and isinstance(self.ground_truth_data, pandas.DataFrame):
_check_dataset_is_within_size_limit(self.ground_truth_data)
@dataclasses.dataclass
class EvaluationTextClassificationSpec(_EvaluationTaskSpec):
"""Spec for text classification model evaluation tasks.
Args:
target_column_name (str):
Required. The label column in the dataset provided in `ground_truth_data`. Required when task_name='text-classification'.
class_names (List[str]):
Required. A list of all possible label names in your dataset. Required when task_name='text-classification'.
"""
target_column_name: str
class_names: List[str]
@property
def task_name(self) -> str:
return "text-classification"
@dataclasses.dataclass
class EvaluationTextGenerationSpec(_EvaluationTaskSpec):
"""Spec for text generation model evaluation tasks."""
@property
def task_name(self) -> str:
return "text-generation"
@dataclasses.dataclass
class EvaluationQuestionAnsweringSpec(_EvaluationTaskSpec):
"""Spec for question answering model evaluation tasks."""
task_name: str = "question-answering"
@dataclasses.dataclass
class EvaluationTextSummarizationSpec(_EvaluationTaskSpec):
"""Spec for text summarization model evaluation tasks."""
task_name: str = "summarization"
@dataclasses.dataclass
class _EvaluationMetricBase:
"""Base class for returned evaulation metrics."""
@property
def input_dataset_paths(self) -> str:
"""The Google Cloud Storage paths to the dataset used for this evaluation."""
pass
@property
def task_name(self) -> str:
"""The type of evaluation task for the evaluation.."""
pass
@dataclasses.dataclass
class EvaluationMetric(_EvaluationMetricBase):
"""The evaluation metric response.
Args:
bleu (float):
            Optional. BLEU (Bilingual Evaluation Understudy). Scores based on the sacrebleu implementation.
rougeLSum (float):
Optional. ROUGE-L (Longest Common Subsequence) scoring at summary level.
"""
bleu: Optional[float] = None
rougeLSum: Optional[float] = None
@dataclasses.dataclass
class EvaluationClassificationMetric(_EvaluationMetricBase):
"""The evaluation metric response for classification metrics.
Args:
label_name (str):
Optional. The name of the label associated with the metrics. This is only
returned when `only_summary_metrics=False` is passed to evaluate().
auPrc (float):
Optional. The area under the precision recall curve.
auRoc (float):
Optional. The area under the receiver operating characteristic curve.
logLoss (float):
Optional. Logarithmic loss.
confidenceMetrics (List[Dict[str, Any]]):
Optional. This is only returned when `only_summary_metrics=False` is
passed to evaluate().
confusionMatrix (Dict[str, Any]):
Optional. This is only returned when `only_summary_metrics=False` is
passed to evaluate().
"""
label_name: Optional[str] = None
auPrc: Optional[float] = None
auRoc: Optional[float] = None
logLoss: Optional[float] = None
confidenceMetrics: Optional[List[Dict[str, Any]]] = None
confusionMatrix: Optional[Dict[str, Any]] = None
@dataclasses.dataclass
class EvaluationSlicedClassificationMetric(_EvaluationMetricBase):
"""The evaluation metric slices returned for classification metrics.
This is returned when `only_summary_metrics=False` is passed to evaluate().
Args:
overall_metrics (EvaluationClassificationMetric):
The evaluation metrics across all slices of data
slices (List[EvaluationClassificationMetric]):
The evaluation metrics for each label slice.
"""
overall_metrics: Optional[EvaluationClassificationMetric] = None
slices: Optional[List[EvaluationClassificationMetric]] = None
def _populate_eval_template_params(
task_spec: _EvaluationTaskSpec,
model_name: str,
service_account: Optional[str] = None,
machine_type: Optional[str] = None,
network: Optional[str] = None,
encryption_spec_key_name: Optional[str] = None,
) -> Dict[str, Any]:
"""Populates a dictionary of template parameters for the evaluation PipelineJob.
Args:
task_spec (EvaluationTaskSpec):
The EvaluationTaskSpec passed to evaluate() for this job
model_name (str):
The resource name of the model being evaluated. Either a PublisherModel or
ModelRegistry resource name.
service_account (Optional[str]):
The default service account for workload run-as account.
machine_type (Optional[str]):
Optional. The type of the machine to run the evaluation job on.
network (Optional[str]):
Optional.
encryption_spec_key_name (Optional[str]):
Optional.
Returns:
Dict[str, Any]:
A dictionary of template parameter names and values to be passed to the PipelineJob
running the model evaluation.
"""
ground_truth_data_gcs_path = task_spec.ground_truth_data
staging_bucket = aiplatform_initializer.global_config.staging_bucket
if not staging_bucket:
staging_bucket = (
gcs_utils.create_gcs_bucket_for_pipeline_artifacts_if_it_does_not_exist()
)
timestamped_eval_directory = (
f"evaluation_data_{aiplatform_utils.timestamped_unique_name()}"
)
if isinstance(task_spec.ground_truth_data, pandas.DataFrame):
# Convert to jsonl file and upload to gcs
dataset_uri = os.path.join(
staging_bucket,
timestamped_eval_directory,
"eval_data.jsonl",
)
gcs_utils._upload_pandas_df_to_gcs(
df=task_spec.ground_truth_data, upload_gcs_path=dataset_uri
)
ground_truth_data_gcs_path = [dataset_uri]
template_params = {
"project": aiplatform_initializer.global_config.project,
"location": aiplatform_initializer.global_config.location,
"batch_predict_gcs_destination_output_uri": f"{staging_bucket}/{timestamped_eval_directory}",
"model_name": model_name,
"batch_predict_gcs_source_uris": ground_truth_data_gcs_path,
"service_account": service_account,
"machine_type": machine_type,
"encrytion_spec_key_name": encryption_spec_key_name
or aiplatform_initializer.global_config.encryption_spec_key_name,
"network": network or aiplatform_initializer.global_config.network,
}
if task_spec.task_name == _TEXT_CLASSIFICATION_TASK_NAME:
template_params["evaluation_class_labels"] = task_spec.class_names
template_params["target_field_name"] = task_spec.target_column_name
else:
template_params["evaluation_task"] = task_spec.task_name
return template_params
# TODO (b/285947054): update to use public pipeline contract
def _get_gcs_uri_from_pipeline_task_details(
pipeline_job: aiplatform.PipelineJob,
) -> Optional[str]:
"""Gets the GCS URI from the PipelineJob output.
Args:
pipeline_job (aiplatform.PipelineJob)
The PipelineJob resource to get the metrics GCS URI from
Returns:
The GCS URI of the evaluation metrics as a string.
"""
for task in pipeline_job.task_details:
if task.task_name == pipeline_job.name and "evaluation_metrics" in task.outputs:
return task.outputs["evaluation_metrics"].artifacts[0].uri
def _convert_metrics_dict_to_response_type(
metrics_json: Dict[str, Any],
metric_type: Type[T],
metric_name: Optional[str] = None,
) -> EvaluationClassificationMetric:
metrics_response = metric_type()
if metric_name:
metrics_response.label_name = metric_name
for metric, value in metrics_json.items():
if hasattr(metrics_response, metric):
setattr(metrics_response, metric, value)
return metrics_response
def _format_classification_metrics(
metrics: Dict[str, Any]
) -> EvaluationSlicedClassificationMetric:
"""Reformats classification metrics returned by the eval pipeline to make them more readable.
    Returned metrics are of type EvaluationSlicedClassificationMetric, with `overall_metrics` representing
    the metrics for all data, and `slices` representing the metrics for each label in the dataset.
Example schema of reformatted metrics:
    EvaluationSlicedClassificationMetric(
overall_metrics=EvaluationClassificationMetric(
auPrc=...
)
slices=[
EvaluationClassificationMetric(
label_name="overall",
auPrc=...,
...
),
EvaluationClassificationMetric(
label_name="label_1",
auPrc=...,
...
),
EvaluationClassificationMetric(
label_name="label_2",
auPrc=...,
...
)
]
)
"""
reformatted_metrics = EvaluationSlicedClassificationMetric()
# TODO: see if we can do this without relying on specific keys, i.e. slicedMetrics
# First add overall metrics
overall_metrics = _convert_metrics_dict_to_response_type(
metrics_json=metrics["slicedMetrics"][0]["metrics"]["classification"],
metric_type=EvaluationClassificationMetric,
)
reformatted_metrics.overall_metrics = overall_metrics
sliced_metrics = []
# Then add metrics for each slice
for idx in range(1, len(metrics["slicedMetrics"])):
metric_slice_name = metrics["slicedMetrics"][idx]["singleOutputSlicingSpec"][
"value"
]
sliced_metric = _convert_metrics_dict_to_response_type(
metrics_json=metrics["slicedMetrics"][idx]["metrics"]["classification"],
metric_type=EvaluationClassificationMetric,
metric_name=metric_slice_name,
)
sliced_metrics.append(sliced_metric)
    reformatted_metrics.slices = sliced_metrics
return reformatted_metrics
def _get_metrics_from_gcs_uri(
gcs_uri: str,
) -> Union[
EvaluationMetric,
EvaluationClassificationMetric,
EvaluationSlicedClassificationMetric,
]:
"""Downloads evaluation metrics from GCS path."""
storage_client = storage.Client(
credentials=aiplatform_initializer.global_config.credentials
)
    metrics_json = json.loads(
        storage.Blob.from_string(
            uri=gcs_uri, client=storage_client
        ).download_as_text()
    )
# Sliced classification metrics case, format data
if "slicedMetrics" in metrics_json:
return _format_classification_metrics(metrics_json)
# If classification metrics don't contain slices, use EvaluationClassificationMetric type
if "auPrc" in metrics_json:
metrics_response = _convert_metrics_dict_to_response_type(
metrics_json=metrics_json,
metric_type=EvaluationClassificationMetric,
)
# All other metric types
else:
metrics_response = _convert_metrics_dict_to_response_type(
metrics_json=metrics_json,
metric_type=EvaluationMetric,
)
return metrics_response
def _get_metrics_from_pipeline_task_details(
pipeline_job: aiplatform.PipelineJob,
) -> Union[EvaluationMetric, EvaluationClassificationMetric]:
"""Gets the evaluation metrics from the PipelineJob TaskDetails.
Args:
pipeline_job (aiplatform.PipelineJob)
The PipelineJob resource to get the metrics from
Returns:
A dictionary with the evaluation metrics
"""
metrics = {}
# TODO (b/292076101): this now uses a public pipelines contract, but still relies on task_details
for task in pipeline_job.task_details:
if task.task_name == pipeline_job.name:
for output in task.outputs:
for metric_name, metric_value in (
task.outputs[output].artifacts[0].metadata.items()
):
metrics[metric_name] = metric_value
if "auPrc" in metrics:
metrics_response = EvaluationClassificationMetric()
else:
metrics_response = EvaluationMetric()
for metric, value in metrics.items():
if hasattr(metrics_response, metric):
setattr(metrics_response, metric, value)
return metrics_response
class _LanguageModelEvaluationJob:
"""Represents a model evaluation job for LLM models.
These evaluation jobs are run as a Vertex Pipeline.
"""
def __init__(
self,
pipeline_job: aiplatform.PipelineJob,
):
self._pipeline_job = pipeline_job
def result(
self, *, only_summary_metrics: bool
) -> Union[EvaluationMetric, EvaluationClassificationMetric]:
"""Blocks on completion of the model evaluation PipelineJob and returns metrics."""
self._pipeline_job.wait()
if only_summary_metrics:
return _get_metrics_from_pipeline_task_details(self._pipeline_job)
else:
gcs_uri = _get_gcs_uri_from_pipeline_task_details(self._pipeline_job)
if gcs_uri:
return _get_metrics_from_gcs_uri(gcs_uri)
class _EvaluatableLanguageModel:
"""Mixin class for LLMs that support model evaluation."""
# TODO (b/282975912): convert training job specific args to a TrainingConfig
def evaluate(
self,
*,
task_spec: _EvaluationTaskSpec,
only_summary_metrics: Optional[bool] = True,
machine_type: Optional[str] = None,
) -> Union[
EvaluationMetric,
EvaluationClassificationMetric,
EvaluationSlicedClassificationMetric,
]:
"""Runs model evaluation using the provided input and ground truth data.
        This creates an evaluation job and blocks until the job completes, which typically takes 10-20 minutes.
Example:
```
model = TextGenerationModel.from_pretrained("text-bison@001")
eval_metrics = model.evaluate(
task_spec=EvaluationTextGenerationSpec(
ground_truth_data="gs://my-bucket/ground-truth.jsonl",
)
)
```
Args:
task_spec (_EvaluationTaskSpec):
Required. The configuration spec for your model evaluation job. Choose the spec corresponding
with the evaluation task you are performing, one of: EvaluationClassificationSpec, EvaluationTextGenerationSpec,
EvaluationTextSummarizationSpec, EvaluationQuestionAnsweringSpec.
For example, a valid classification `task_spec` is:
EvaluationTextClassificationSpec(
ground_truth_data=["gs://bucket/path/to/your/data.jsonl"],
class_names=["cheddar", "gouda", "camembert"],
target_column_name="cheese_type",
)
only_summary_metrics (bool):
Optional. Setting this field to False only affects the metrics returned for text classification tasks.
When False, text classification metrics will include additional sliced metrics fields, with metrics for
each label slice in the data.
machine_type (str):
Optional. The type of the machine to run the evaluation job on. The default value is "e2-highmem-16". For
tasks with a large evaluation dataset, a bigger machine type may be required.
For more details about this input config, see
https://cloud.google.com/vertex-ai/docs/training/configure-compute#machine-types.
Returns:
            Union[EvaluationMetric, EvaluationClassificationMetric, EvaluationSlicedClassificationMetric]
                The evaluation metrics from this evaluation job. When `only_summary_metrics=False` is passed
                and the evaluation task type is 'text-classification', the return type is EvaluationSlicedClassificationMetric,
                whose `slices` field holds the metrics associated with each classification label.
"""
model_info = _model_garden_models._get_model_info(
self._model_id,
schema_to_class_map={self._INSTANCE_SCHEMA_URI: type(self)},
)
model_name = _get_model_resource_name_and_validate(
model_name=self._model_resource_name, model_info=model_info
)
# TODO(b/296402511): get service_account from aiplatform_initializer and pass it to the template here and to PipelineJob after cl/539823838 is submitted
template_params = _populate_eval_template_params(
task_spec=task_spec,
model_name=model_name,
machine_type=machine_type,
network=aiplatform_initializer.global_config.network,
encryption_spec_key_name=aiplatform_initializer.global_config.encryption_spec_key_name,
)
template_path = _get_template_url(task_spec.task_name)
pipeline_job = aiplatform.PipelineJob(
template_path=template_path,
parameter_values=template_params,
display_name=f"llm-eval-sdk-{aiplatform_utils.timestamped_unique_name()}",
)
pipeline_job.submit()
eval_job = _LanguageModelEvaluationJob(pipeline_job=pipeline_job)
_LOGGER.info(
"Your evaluation job is running and will take 15-20 minutes to complete. Click on the PipelineJob link to view progress."
)
# NOTE: only_summary_metrics is passed because getting metrics from the artifact is faster than downloading from GCS
# GCS is only needed for additional metrics for text-classification tasks
return eval_job.result(only_summary_metrics=only_summary_metrics)
def list_evaluation_metrics(
self,
*,
task_name: Optional[str] = None,
only_summary_metrics: Optional[bool] = True,
) -> List[Union[EvaluationMetric, EvaluationClassificationMetric]]:
"""Lists the evaluation metrics from all evaluation jobs run on this model.
Args:
task_name (str):
Optional. The task name to return evaluation metrics for. If provided, this will only return evaluation
                metrics for tasks of the provided type. This matches the possible values of the evaluation task spec's task_name,
and must be one of 'text-generation', 'text-classification', 'summarization', or 'question-answering'.
Returns:
            List[Union[EvaluationMetric, EvaluationClassificationMetric]]
The evaluation metrics from all evaluation jobs run on this model.
"""
model_name = self._model_resource_name
publisher_model_parts = model_garden_service_client.ModelGardenServiceClient.parse_publisher_model_path(
"".join(model_name.rpartition("publishers")[1:])
)
if publisher_model_parts:
model_id = publisher_model_parts["model"]
model_name = f"publishers/google/models/{model_id}"
filters = f'metadata.component_type.string_value={_EVALUATION_PIPELINE_COMPONENT_IDENTIFIER} AND metadata."input:model_name".string_value={model_name} AND (metadata."input:evaluation_task".string_value={_TEXT_GENERATION_TASK_NAME} OR metadata."input:evaluation_task".string_value={_SUMMARIZATION_TASK_NAME} OR metadata."input:evaluation_task".string_value={_QA_TASK_NAME} OR metadata."input:evaluation_task".string_value={_TEXT_CLASSIFICATION_TASK_NAME})'
# NOTE: when task_name is appended to the filter the block of OR filters in `filters` above becomes a no-op
if task_name:
filters += f' AND metadata."input:evaluation_task".string_value={task_name}'
filtered_pipeline_executions = aiplatform.Execution.list(
filter=filters,
project=aiplatform_initializer.global_config.project,
location=aiplatform_initializer.global_config.location,
credentials=aiplatform_initializer.global_config.credentials,
)
model_eval_metrics = []
# TODO (b/285950380): improve performance of this method
for pipeline_execution in filtered_pipeline_executions:
if "pipeline_job_resource_name" not in pipeline_execution.metadata:
continue
pipeline_job_resource = aiplatform.PipelineJob.get(
resource_name=pipeline_execution.metadata["pipeline_job_resource_name"]
)
eval_job_state = pipeline_job_resource._gca_resource.state
if (
eval_job_state
!= gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED
):
continue
metrics = None
if only_summary_metrics:
metrics = _get_metrics_from_pipeline_task_details(pipeline_job_resource)
else:
gcs_uri = _get_gcs_uri_from_pipeline_task_details(pipeline_job_resource)
if gcs_uri:
metrics = _get_metrics_from_gcs_uri(gcs_uri)
metrics.input_dataset_paths = pipeline_execution.metadata[
"input:batch_predict_gcs_source_uris"
]
metrics.task_name = pipeline_execution.metadata["input:evaluation_task"]
model_eval_metrics.append(metrics)
return model_eval_metrics
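
Illustrative sketch of the evaluation mixin above, assuming it is mixed into a supported text model (e.g. text-bison@001); the project, bucket paths, label names, and target column are placeholder assumptions:

import vertexai
from vertexai.preview.language_models import (
    EvaluationTextClassificationSpec,
    TextGenerationModel,
)

vertexai.init(project="my-project", location="us-central1")

model = TextGenerationModel.from_pretrained("text-bison@001")

# Blocks until the evaluation PipelineJob finishes (roughly 10-20 minutes).
metrics = model.evaluate(
    task_spec=EvaluationTextClassificationSpec(
        ground_truth_data=["gs://my-bucket/eval/cheese.jsonl"],
        class_names=["cheddar", "gouda", "camembert"],
        target_column_name="cheese_type",
    ),
    only_summary_metrics=False,  # also return per-label sliced metrics
)
print(metrics.overall_metrics.auPrc)
for label_slice in metrics.slices or []:
    print(label_slice.label_name, label_slice.auPrc)

# Metrics from previous evaluation runs on this model can be listed as well.
for past in model.list_evaluation_metrics(task_name="text-classification"):
    print(past.task_name, past.input_dataset_paths)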

File diff suppressed because it is too large.