structure saas with tools

Davidson Gomes
2025-04-25 15:30:54 -03:00
commit 1aef473937
16434 changed files with 6584257 additions and 0 deletions

View File

@@ -0,0 +1,72 @@
# -*- coding: utf-8 -*-
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Evaluation Metrics Module."""
from vertexai.preview.evaluation.metrics import _base
from vertexai.preview.evaluation.metrics import _rouge
from vertexai.preview.evaluation.metrics import (
_trajectory_single_tool_use,
)
from vertexai.preview.evaluation.metrics import (
custom_output_config,
)
from vertexai.preview.evaluation.metrics import (
metric_prompt_template,
)
from vertexai.preview.evaluation.metrics import (
metric_prompt_template_examples,
)
from vertexai.preview.evaluation.metrics import pairwise_metric
from vertexai.preview.evaluation.metrics import pointwise_metric
from vertexai.preview.evaluation.metrics import (
predefined_rubric_metrics,
)
from vertexai.preview.evaluation.metrics import (
rubric_based_metric,
)
PairwiseMetric = pairwise_metric.PairwiseMetric
PointwiseMetric = pointwise_metric.PointwiseMetric
CustomMetric = _base.CustomMetric
PairwiseMetricPromptTemplate = metric_prompt_template.PairwiseMetricPromptTemplate
PointwiseMetricPromptTemplate = metric_prompt_template.PointwiseMetricPromptTemplate
MetricPromptTemplateExamples = (
metric_prompt_template_examples.MetricPromptTemplateExamples
)
Rouge = _rouge.Rouge
TrajectorySingleToolUse = _trajectory_single_tool_use.TrajectorySingleToolUse
CustomOutputConfig = custom_output_config.CustomOutputConfig
RubricBasedMetric = rubric_based_metric.RubricBasedMetric
RubricGenerationConfig = _base.RubricGenerationConfig
PredefinedRubricMetrics = predefined_rubric_metrics.PredefinedRubricMetrics
__all__ = [
"CustomMetric",
"PairwiseMetric",
"PointwiseMetric",
"PairwiseMetricPromptTemplate",
"PointwiseMetricPromptTemplate",
"MetricPromptTemplateExamples",
"Rouge",
"TrajectorySingleToolUse",
"CustomOutputConfig",
"RubricBasedMetric",
"RubricGenerationConfig",
"PredefinedRubricMetrics",
]
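# Usage sketch (illustrative, not part of the shipped module): building a
# model-based metric from this package's public names. The criterion and
# rubric contents below are hypothetical.
#
#   template = PointwiseMetricPromptTemplate(
#       criteria={"fluency": "The response reads naturally."},
#       rating_rubric={"1": "Fluent.", "0": "Not fluent."},
#   )
#   fluency_metric = PointwiseMetric(
#       metric="custom_fluency",
#       metric_prompt_template=template,
#   )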

View File

@@ -0,0 +1,168 @@
# -*- coding: utf-8 -*-
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Base classes for evaluation metrics."""
import abc
from typing import Any, Callable, Dict, Literal, Optional, Union, List
from google.cloud.aiplatform_v1beta1.types import (
evaluation_service as gapic_eval_service_types,
)
from vertexai import generative_models
from vertexai.preview.evaluation import constants
from vertexai.preview.evaluation.metrics import (
custom_output_config as custom_output_config_class,
)
from vertexai.preview.evaluation.metrics import (
metric_prompt_template as metric_prompt_template_base,
)
_ModelType = Union[generative_models.GenerativeModel, Callable[[str], str]]
class _Metric(abc.ABC):
"""The abstract class for evaluation metric."""
def __init__(self, metric: str):
self._metric = metric
def __str__(self):
return self.metric_name
@property
def metric_name(self) -> str:
return self._metric
class _ModelBasedMetric(_Metric):
"""A Model-based Metric.
An evaluation metric that evaluates generative AI model responses with
another generative model as a judge. This metric can be used to evaluate a
single model, or two models side-by-side.
For more details on when to use model-based metrics, see
[Evaluation methods and metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval).
"""
def __init__(
self,
*,
metric: str,
metric_prompt_template: Union[
metric_prompt_template_base.PointwiseMetricPromptTemplate,
metric_prompt_template_base.PairwiseMetricPromptTemplate,
str,
],
system_instruction: Optional[str] = None,
autorater_config: Optional[gapic_eval_service_types.AutoraterConfig] = None,
custom_output_config: Optional[
custom_output_config_class.CustomOutputConfig
] = None,
):
"""Initializes the model-based evaluation metric.
        Args:
            metric: The generic model-based metric name.
            metric_prompt_template: A metric prompt template for performing
                the model-based evaluation. A freeform string is also accepted.
            system_instruction: The system instruction to be used in the metric
                prompt.
            autorater_config: The config for the judge model.
            custom_output_config: Config for custom output from the judge model.
        """
super().__init__(metric=metric)
self.metric_prompt_template = str(metric_prompt_template)
self.system_instruction = system_instruction
self.autorater_config = autorater_config
self.custom_output_config = custom_output_config
class CustomMetric(_Metric):
"""The custom evaluation metric.
A fully-customized CustomMetric that can be used to evaluate a single model
by defining a metric function for a computation-based metric. The
CustomMetric is computed on the client-side using the user-defined metric
function in SDK only, not by the Vertex Gen AI Evaluation Service.
Attributes:
name: The name of the metric.
metric_function: The user-defined evaluation function to compute a metric
score. Must use the dataset row dictionary as the metric function
input and return per-instance metric result as a dictionary output.
The metric score must mapped to the name of the CustomMetric as key.
"""
def __init__(
self,
name: str,
metric_function: Callable[
[Dict[str, Any]],
Dict[str, Any],
],
):
"""Initializes the evaluation metric."""
super().__init__(name)
self.name = name
self.metric_function = metric_function
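# Usage sketch (illustrative): a CustomMetric whose function takes a dataset
# row dictionary and keys its score by the metric name, as the contract above
# requires. The `response` column name is an assumption.
#
#   def _word_count(row: Dict[str, Any]) -> Dict[str, Any]:
#       return {"word_count": len(row["response"].split())}
#
#   word_count_metric = CustomMetric(name="word_count", metric_function=_word_count)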
class _AutomaticMetric(_Metric):
"""An automatic metric that computes deterministic score based on reference.
An lexicon-based evaluation metric that evaluate a generative model's
response on the given evaluation task with reference ground truth answers.
It is a type of pointwise evaluation metric.
For more details on when to use automatic metrics, see
[Evaluation methods and
metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval).
"""
def __init__(
self,
metric: Literal[constants.Metric.ROUGE],
):
"""Initializes the automatic evaluation metric.
Args:
metric: The automatic evaluation metric name.
"""
super().__init__(metric=metric)
class RubricGenerationConfig:
"""The rubric generation config."""
def __init__(
self,
prompt_template: str,
model: Optional[_ModelType] = None,
parsing_fn: Optional[Callable[[str], List[str]]] = None,
):
"""Initializes the rubric generation config.
Args:
prompt_template: The prompt template for rubric generation.
model: The model to use for rubric generation.
parsing_fn: The function to parse the rubric generation response.
"""
self.prompt_template = prompt_template
self.model = model
self.parsing_fn = parsing_fn
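# Usage sketch (illustrative): a rubric generation config with a hypothetical
# prompt template and a parser that splits the model output into one rubric
# per line.
#
#   rubric_config = RubricGenerationConfig(
#       prompt_template="Write one rubric per line for: {prompt}",
#       parsing_fn=lambda text: [line for line in text.splitlines() if line],
#   )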

View File

@@ -0,0 +1,802 @@
# -*- coding: utf-8 -*-
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Library for metrics computation with Gen AI Evaluation Service."""
import json
from typing import Any, Dict, List, Union
from google import api_core
from google.cloud.aiplatform import base
from google.cloud.aiplatform import initializer
from google.cloud.aiplatform_v1beta1.services import (
evaluation_service as gapic_evaluation_services,
)
from google.cloud.aiplatform_v1beta1.types import (
evaluation_service as gapic_eval_service_types,
)
from vertexai.preview.evaluation import _base as eval_base
from vertexai.preview.evaluation import constants
from vertexai.preview.evaluation import multimodal_utils
from vertexai.preview.evaluation import (
prompt_template as prompt_template_base,
)
from vertexai.preview.evaluation import utils
from vertexai.preview.evaluation.metrics import (
_base as metrics_base,
)
from vertexai.preview.evaluation.metrics import (
_default_templates,
)
from vertexai.preview.evaluation.metrics import _rouge
from vertexai.preview.evaluation.metrics import (
_trajectory_single_tool_use,
)
from vertexai.preview.evaluation.metrics import (
custom_output_config as custom_output_config_class,
)
from vertexai.preview.evaluation.metrics import pairwise_metric
from vertexai.preview.evaluation.metrics import pointwise_metric
from google.protobuf import json_format
_LOGGER = base.Logger(__name__)
_METRIC_NAME_TO_METRIC_SPEC = {
# Automatic Metrics.
constants.Metric.EXACT_MATCH: (gapic_eval_service_types.ExactMatchSpec()),
constants.Metric.BLEU: gapic_eval_service_types.BleuSpec(),
constants.Metric.ROUGE: gapic_eval_service_types.RougeSpec(),
constants.Metric.ROUGE_1: gapic_eval_service_types.RougeSpec(rouge_type="rouge1"),
constants.Metric.ROUGE_2: gapic_eval_service_types.RougeSpec(rouge_type="rouge2"),
constants.Metric.ROUGE_L: gapic_eval_service_types.RougeSpec(rouge_type="rougeL"),
constants.Metric.ROUGE_L_SUM: gapic_eval_service_types.RougeSpec(
rouge_type="rougeLsum"
),
constants.Metric.TOOL_CALL_VALID: (gapic_eval_service_types.ToolCallValidSpec()),
constants.Metric.TOOL_NAME_MATCH: (gapic_eval_service_types.ToolNameMatchSpec()),
constants.Metric.TOOL_PARAMETER_KV_MATCH: (
gapic_eval_service_types.ToolParameterKVMatchSpec()
),
constants.Metric.TOOL_PARAMETER_KEY_MATCH: (
gapic_eval_service_types.ToolParameterKeyMatchSpec()
),
# Pointwise Metrics.
constants.Metric.POINTWISE_METRIC: (gapic_eval_service_types.PointwiseMetricSpec()),
# Pairwise Metrics.
constants.Metric.PAIRWISE_METRIC: (gapic_eval_service_types.PairwiseMetricSpec()),
constants.Metric.RUBRIC_BASED_INSTRUCTION_FOLLOWING: (
gapic_eval_service_types.RubricBasedInstructionFollowingSpec()
),
constants.Metric.TRAJECTORY_EXACT_MATCH: (
gapic_eval_service_types.TrajectoryExactMatchSpec()
),
constants.Metric.TRAJECTORY_IN_ORDER_MATCH: (
gapic_eval_service_types.TrajectoryInOrderMatchSpec()
),
constants.Metric.TRAJECTORY_ANY_ORDER_MATCH: (
gapic_eval_service_types.TrajectoryAnyOrderMatchSpec()
),
constants.Metric.TRAJECTORY_PRECISION: (
gapic_eval_service_types.TrajectoryPrecisionSpec()
),
constants.Metric.TRAJECTORY_RECALL: (
gapic_eval_service_types.TrajectoryRecallSpec()
),
constants.Metric.TRAJECTORY_SINGLE_TOOL_USE: (
gapic_eval_service_types.TrajectorySingleToolUseSpec()
),
}
_QUESTION_TEMPLATE = """<question>{question}"""
def _format_rubrics(questions: List[str]) -> str:
"""Formats the list of rubrics into a question block."""
question_block = "\n".join(
_QUESTION_TEMPLATE.format(question=q.strip()) for q in questions
)
return question_block
def build_custom_output_format_config(
custom_output_config: custom_output_config_class.CustomOutputConfig,
) -> Union[gapic_eval_service_types.CustomOutputFormatConfig, None]:
"""Builds a CustomOutputFormatConfig from user input."""
custom_output_cfg = gapic_eval_service_types.CustomOutputFormatConfig()
if custom_output_config.return_raw_output:
custom_output_cfg.return_raw_output = True
return custom_output_cfg
else:
return None
def build_trajectory(
trajectory: Union[str, List[Dict[str, Any]]],
) -> gapic_eval_service_types.Trajectory:
"""Builds a trajectory from user input."""
if not trajectory:
return
if isinstance(trajectory, str):
trajectory = json.loads(trajectory)
if isinstance(trajectory, List):
try:
tool_calls = []
for tool_call_dict in trajectory:
tool_input_str = json.dumps(tool_call_dict["tool_input"])
tool_calls.append(
gapic_eval_service_types.ToolCall(
tool_name=tool_call_dict["tool_name"], tool_input=tool_input_str
)
)
return gapic_eval_service_types.Trajectory(tool_calls=tool_calls)
except KeyError as e:
_LOGGER.error(f"Failed to parse trajectory: {e}")
else:
_LOGGER.error(
f"Unsupported trajectory type: {type(trajectory)}, expected list or"
" a JSON array."
)
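# Input shapes accepted by build_trajectory (illustrative): a JSON-encoded
# array, or a list of dicts with "tool_name" and "tool_input" keys.
#
#   build_trajectory('[{"tool_name": "search", "tool_input": {"query": "x"}}]')
#   build_trajectory([{"tool_name": "search", "tool_input": {"query": "x"}}])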
def build_request(
metric: Union[str, metrics_base._Metric],
row_dict: Dict[str, Any],
evaluation_run_config: eval_base.EvaluationRunConfig,
) -> gapic_eval_service_types.EvaluateInstancesRequest:
"""Builds a metric instance and form the request for the evaluation service.
Args:
metric: The name of the metric to evaluate.
row_dict: An evaluation dataset instance as a dictionary.
evaluation_run_config: Evaluation run configurations.
Returns:
A single EvaluateInstancesRequest.
Raises:
ValueError: If required request fields are not provided.
"""
project = initializer.global_config.project
location = initializer.global_config.location
if not project or not location:
raise ValueError(
"No project or location specified. Please run `vertexai.init()` to"
" provide these parameters."
)
location_path = (
gapic_evaluation_services.EvaluationServiceClient.common_location_path(
project, location
)
)
if isinstance(metric, pointwise_metric.PointwiseMetric):
metric_name = constants.Metric.POINTWISE_METRIC
elif isinstance(metric, pairwise_metric.PairwiseMetric):
metric_name = constants.Metric.PAIRWISE_METRIC
else:
metric_name = str(metric)
try:
metric_spec = _METRIC_NAME_TO_METRIC_SPEC[metric_name]
except KeyError as e:
raise ValueError(f"Metric name: {metric_name} is not supported.") from e
model_based_metric_instance_input = {}
metric_column_mapping = evaluation_run_config.metric_column_mapping
if isinstance(
metric, metrics_base._ModelBasedMetric # pylint: disable=protected-access
):
metric_spec.metric_prompt_template = metric.metric_prompt_template
metric_spec.system_instruction = metric.system_instruction
if metric.custom_output_config:
metric_spec.custom_output_format_config = build_custom_output_format_config(
metric.custom_output_config
)
for variable in prompt_template_base.PromptTemplate(
metric.metric_prompt_template
).variables:
model_based_metric_instance_input[variable] = row_dict.get(
metric_column_mapping.get(variable),
"",
)
if isinstance(metric, pairwise_metric.PairwiseMetric):
metric_column_mapping = evaluation_run_config.metric_column_mapping
metric_spec.candidate_response_field_name = metric_column_mapping.get(
constants.Dataset.MODEL_RESPONSE_COLUMN,
constants.Dataset.MODEL_RESPONSE_COLUMN,
)
metric_spec.baseline_response_field_name = metric_column_mapping.get(
constants.Dataset.BASELINE_MODEL_RESPONSE_COLUMN,
constants.Dataset.BASELINE_MODEL_RESPONSE_COLUMN,
)
elif isinstance(metric, _rouge.Rouge):
metric_spec.rouge_type = metric.rouge_type
metric_spec.use_stemmer = metric.use_stemmer
metric_spec.split_summaries = metric.split_summaries
elif isinstance(metric, _trajectory_single_tool_use.TrajectorySingleToolUse):
metric_spec.tool_name = metric.tool_name
response = row_dict.get(
metric_column_mapping.get(constants.Dataset.MODEL_RESPONSE_COLUMN), ""
)
reference = row_dict.get(
metric_column_mapping.get(constants.Dataset.REFERENCE_COLUMN), ""
)
predicted_trajectory = build_trajectory(
row_dict.get(
metric_column_mapping.get(constants.Dataset.PREDICTED_TRAJECTORY_COLUMN),
"",
)
)
reference_trajectory = build_trajectory(
row_dict.get(
metric_column_mapping.get(constants.Dataset.REFERENCE_TRAJECTORY_COLUMN),
"",
)
)
if isinstance(metric, metrics_base._ModelBasedMetric):
if metric_spec.metric_prompt_template in (
_default_templates.INSTRUCTION_FOLLOWING_RUBRIC_CRITIQUE_TEMPLATE,
_default_templates.MULTIMODAL_UNDERSTANDING_RUBRIC_CRITIQUE_TEMPLATE,
_default_templates.TEXT_QUALITY_RUBRIC_CRITIQUE_TEMPLATE,
_default_templates.PAIRWISE_INSTRUCTION_FOLLOWING_RUBRIC_CRITIQUE_TEMPLATE,
_default_templates.PAIRWISE_MULTIMODAL_UNDERSTANDING_RUBRIC_CRITIQUE_TEMPLATE,
_default_templates.PAIRWISE_TEXT_QUALITY_RUBRIC_CRITIQUE_TEMPLATE,
):
model_based_metric_instance_input[
constants.Dataset.RUBRICS_COLUMN
] = _format_rubrics(
model_based_metric_instance_input[constants.Dataset.RUBRICS_COLUMN]
)
if (
constants.Dataset.RUBRICS_COLUMN in model_based_metric_instance_input
and isinstance(
model_based_metric_instance_input[constants.Dataset.RUBRICS_COLUMN],
List,
)
):
model_based_metric_instance_input[
constants.Dataset.RUBRICS_COLUMN
] = "\n".join(
model_based_metric_instance_input[constants.Dataset.RUBRICS_COLUMN]
)
if metric_name == constants.Metric.EXACT_MATCH:
instance = gapic_eval_service_types.ExactMatchInput(
metric_spec=metric_spec,
instances=[
gapic_eval_service_types.ExactMatchInstance(
prediction=response,
reference=reference,
)
],
)
return gapic_eval_service_types.EvaluateInstancesRequest(
location=location_path,
exact_match_input=instance,
)
elif metric_name == constants.Metric.BLEU:
instance = gapic_eval_service_types.BleuInput(
metric_spec=metric_spec,
instances=[
gapic_eval_service_types.BleuInstance(
prediction=response,
reference=reference,
)
],
)
return gapic_eval_service_types.EvaluateInstancesRequest(
location=location_path,
bleu_input=instance,
)
elif metric_name in (
constants.Metric.ROUGE,
constants.Metric.ROUGE_1,
constants.Metric.ROUGE_2,
constants.Metric.ROUGE_L,
constants.Metric.ROUGE_L_SUM,
):
instance = gapic_eval_service_types.RougeInput(
metric_spec=metric_spec,
instances=[
gapic_eval_service_types.RougeInstance(
prediction=response,
reference=reference,
)
],
)
return gapic_eval_service_types.EvaluateInstancesRequest(
location=location_path,
rouge_input=instance,
)
elif metric_name == constants.Metric.TOOL_CALL_VALID:
instance = gapic_eval_service_types.ToolCallValidInput(
metric_spec=metric_spec,
instances=[
gapic_eval_service_types.ToolCallValidInstance(
prediction=response,
reference=reference,
)
],
)
return gapic_eval_service_types.EvaluateInstancesRequest(
location=location_path,
tool_call_valid_input=instance,
)
elif metric_name == constants.Metric.TOOL_NAME_MATCH:
instance = gapic_eval_service_types.ToolNameMatchInput(
metric_spec=metric_spec,
instances=[
gapic_eval_service_types.ToolNameMatchInstance(
prediction=response,
reference=reference,
)
],
)
return gapic_eval_service_types.EvaluateInstancesRequest(
location=location_path,
tool_name_match_input=instance,
)
elif metric_name == constants.Metric.TOOL_PARAMETER_KEY_MATCH:
instance = gapic_eval_service_types.ToolParameterKeyMatchInput(
metric_spec=metric_spec,
instances=[
gapic_eval_service_types.ToolParameterKeyMatchInstance(
prediction=response,
reference=reference,
)
],
)
return gapic_eval_service_types.EvaluateInstancesRequest(
location=location_path,
tool_parameter_key_match_input=instance,
)
elif metric_name == constants.Metric.TOOL_PARAMETER_KV_MATCH:
instance = gapic_eval_service_types.ToolParameterKVMatchInput(
metric_spec=metric_spec,
instances=[
gapic_eval_service_types.ToolParameterKVMatchInstance(
prediction=response,
reference=reference,
)
],
)
return gapic_eval_service_types.EvaluateInstancesRequest(
location=location_path,
tool_parameter_kv_match_input=instance,
)
elif metric_name == constants.Metric.POINTWISE_METRIC:
if multimodal_utils.is_multimodal_instance(model_based_metric_instance_input):
instance = gapic_eval_service_types.PointwiseMetricInput(
metric_spec=metric_spec,
instance=gapic_eval_service_types.PointwiseMetricInstance(
content_map_instance=multimodal_utils.convert_multimodal_response_to_content_map(
model_based_metric_instance_input
),
),
)
else:
instance = gapic_eval_service_types.PointwiseMetricInput(
metric_spec=metric_spec,
instance=gapic_eval_service_types.PointwiseMetricInstance(
json_instance=json.dumps(model_based_metric_instance_input),
),
)
autorater_config = evaluation_run_config.autorater_config
if (
isinstance(metric, metrics_base._ModelBasedMetric)
and metric.autorater_config
):
autorater_config = metric.autorater_config
return gapic_eval_service_types.EvaluateInstancesRequest(
location=location_path,
pointwise_metric_input=instance,
autorater_config=autorater_config,
)
elif metric_name == constants.Metric.PAIRWISE_METRIC:
if multimodal_utils.is_multimodal_instance(model_based_metric_instance_input):
instance = gapic_eval_service_types.PairwiseMetricInput(
metric_spec=metric_spec,
instance=gapic_eval_service_types.PairwiseMetricInstance(
content_map_instance=multimodal_utils.convert_multimodal_response_to_content_map(
model_based_metric_instance_input
),
),
)
else:
instance = gapic_eval_service_types.PairwiseMetricInput(
metric_spec=metric_spec,
instance=gapic_eval_service_types.PairwiseMetricInstance(
json_instance=json.dumps(model_based_metric_instance_input),
),
)
autorater_config = evaluation_run_config.autorater_config
if (
isinstance(metric, metrics_base._ModelBasedMetric)
and metric.autorater_config
):
autorater_config = metric.autorater_config
return gapic_eval_service_types.EvaluateInstancesRequest(
location=location_path,
pairwise_metric_input=instance,
autorater_config=autorater_config,
)
elif metric_name == constants.Metric.RUBRIC_BASED_INSTRUCTION_FOLLOWING:
required_rbif_fields = [
constants.Dataset.MODEL_RESPONSE_COLUMN,
constants.Dataset.PROMPT_COLUMN,
]
for field in required_rbif_fields:
column_name = metric_column_mapping.get(field)
value = row_dict.get(column_name)
if value is None and field in required_rbif_fields:
raise ValueError(
f"Missing required field: `{field}` for "
f"{constants.Metric.RUBRIC_BASED_INSTRUCTION_FOLLOWING}."
)
else:
model_based_metric_instance_input[field] = value
instance = gapic_eval_service_types.RubricBasedInstructionFollowingInput(
metric_spec=metric_spec,
instance=gapic_eval_service_types.RubricBasedInstructionFollowingInstance(
json_instance=json.dumps(model_based_metric_instance_input),
),
)
return gapic_eval_service_types.EvaluateInstancesRequest(
location=location_path,
rubric_based_instruction_following_input=instance,
)
elif metric_name == constants.Metric.TRAJECTORY_EXACT_MATCH:
instance = gapic_eval_service_types.TrajectoryExactMatchInput(
metric_spec=metric_spec,
instances=[
gapic_eval_service_types.TrajectoryExactMatchInstance(
predicted_trajectory=predicted_trajectory,
reference_trajectory=reference_trajectory,
)
],
)
return gapic_eval_service_types.EvaluateInstancesRequest(
location=location_path,
trajectory_exact_match_input=instance,
)
elif metric_name == constants.Metric.TRAJECTORY_IN_ORDER_MATCH:
instance = gapic_eval_service_types.TrajectoryInOrderMatchInput(
metric_spec=metric_spec,
instances=[
gapic_eval_service_types.TrajectoryInOrderMatchInstance(
predicted_trajectory=predicted_trajectory,
reference_trajectory=reference_trajectory,
)
],
)
return gapic_eval_service_types.EvaluateInstancesRequest(
location=location_path,
trajectory_in_order_match_input=instance,
)
elif metric_name == constants.Metric.TRAJECTORY_ANY_ORDER_MATCH:
instance = gapic_eval_service_types.TrajectoryAnyOrderMatchInput(
metric_spec=metric_spec,
instances=[
gapic_eval_service_types.TrajectoryAnyOrderMatchInstance(
predicted_trajectory=predicted_trajectory,
reference_trajectory=reference_trajectory,
)
],
)
return gapic_eval_service_types.EvaluateInstancesRequest(
location=location_path,
trajectory_any_order_match_input=instance,
)
elif metric_name == constants.Metric.TRAJECTORY_PRECISION:
instance = gapic_eval_service_types.TrajectoryPrecisionInput(
metric_spec=metric_spec,
instances=[
gapic_eval_service_types.TrajectoryPrecisionInstance(
predicted_trajectory=predicted_trajectory,
reference_trajectory=reference_trajectory,
)
],
)
return gapic_eval_service_types.EvaluateInstancesRequest(
location=location_path,
trajectory_precision_input=instance,
)
elif metric_name == constants.Metric.TRAJECTORY_RECALL:
instance = gapic_eval_service_types.TrajectoryRecallInput(
metric_spec=metric_spec,
instances=[
gapic_eval_service_types.TrajectoryRecallInstance(
predicted_trajectory=predicted_trajectory,
reference_trajectory=reference_trajectory,
)
],
)
return gapic_eval_service_types.EvaluateInstancesRequest(
location=location_path,
trajectory_recall_input=instance,
)
elif metric_name == constants.Metric.TRAJECTORY_SINGLE_TOOL_USE:
instance = gapic_eval_service_types.TrajectorySingleToolUseInput(
metric_spec=metric_spec,
instances=[
gapic_eval_service_types.TrajectorySingleToolUseInstance(
predicted_trajectory=predicted_trajectory,
)
],
)
return gapic_eval_service_types.EvaluateInstancesRequest(
location=location_path,
trajectory_single_tool_use_input=instance,
)
else:
raise ValueError(f"Unknown metric type: {metric_name}")
def _parse_autometric_results(
metric_result_dict: Dict[str, Any],
) -> Dict[str, Any]:
"""Parses the automatic metric results from the evaluation results.
Args:
metric_result_dict: The metric results dictionary.
Returns:
        A dictionary containing the metric score.
"""
for value in metric_result_dict.values():
return {
constants.MetricResult.SCORE_KEY: value[0].get(
constants.MetricResult.SCORE_KEY
)
}
def _parse_pointwise_results(
metric_result_dict: Dict[str, Any],
metric: Union[str, metrics_base._Metric],
) -> Dict[str, Any]:
"""Parses the model-based pointwise metric results from the evaluation results.
Args:
metric_result_dict: The metric results dictionary.
metric: The metric to evaluate.
Returns:
One of the following:
1. A dictionary containing raw outputs from the judge model if
return_raw_output is set to True in custom_output_config.
        2. A dictionary containing the metric score and explanation of the
           metric if custom_output_config is not set.
"""
if (
isinstance(metric, pointwise_metric.PointwiseMetric)
and getattr(metric, "custom_output_config", None)
and getattr(metric.custom_output_config, "return_raw_output", False)
):
raw_outputs = (
metric_result_dict.get(constants.MetricResult.CUSTOM_OUTPUT_KEY)
.get(constants.MetricResult.RAW_OUTPUTS_KEY)
.get(constants.MetricResult.RAW_OUTPUT_KEY)
)
if (
isinstance(metric, pointwise_metric.PointwiseMetric)
and getattr(metric, "custom_output_config", None)
and getattr(metric.custom_output_config, "parsing_fn", None)
):
parsing_fn = metric.custom_output_config.parsing_fn
return parsing_fn(raw_outputs)
return {constants.MetricResult.RAW_OUTPUT_KEY: raw_outputs}
else:
return {
constants.MetricResult.SCORE_KEY: metric_result_dict.get(
constants.MetricResult.SCORE_KEY
),
constants.MetricResult.EXPLANATION_KEY: metric_result_dict.get(
constants.MetricResult.EXPLANATION_KEY
),
}
def _parse_pairwise_results(
metric_result_dict: Dict[str, Any],
metric: Union[str, metrics_base._Metric],
) -> Dict[str, Any]:
"""Parses the pairwise metric results from the evaluation results.
Args:
metric_result_dict: The metric results dictionary.
metric: The metric to evaluate.
Returns:
One of the following:
1. A dictionary containing raw outputs from the judge model if
return_raw_output is set to True in custom_output_config.
        2. A dictionary containing the metric score and explanation of the
           metric if custom_output_config is not set.
"""
if (
isinstance(metric, pairwise_metric.PairwiseMetric)
and getattr(metric, "custom_output_config", None)
and getattr(metric.custom_output_config, "return_raw_output", False)
):
raw_outputs = (
metric_result_dict.get(constants.MetricResult.CUSTOM_OUTPUT_KEY)
.get(constants.MetricResult.RAW_OUTPUTS_KEY)
.get(constants.MetricResult.RAW_OUTPUT_KEY)
)
if (
isinstance(metric, pairwise_metric.PairwiseMetric)
and getattr(metric, "custom_output_config", None)
and getattr(metric.custom_output_config, "parsing_fn", None)
):
parsing_fn = metric.custom_output_config.parsing_fn
return parsing_fn(raw_outputs)
return {constants.MetricResult.RAW_OUTPUT_KEY: raw_outputs}
else:
return {
constants.MetricResult.PAIRWISE_CHOICE_KEY: metric_result_dict.get(
constants.MetricResult.PAIRWISE_CHOICE_KEY,
),
constants.MetricResult.EXPLANATION_KEY: metric_result_dict.get(
constants.MetricResult.EXPLANATION_KEY
),
}
def _parse_rubric_based_instruction_following_results(
metric_result_dict: Dict[str, Any],
) -> Dict[str, Any]:
"""Parses the rubric-based instruction following metric results from the evaluation results.
Args:
metric_result_dict: The metric results dictionary.
Returns:
A dictionary containing a list of rubrics and corresponding verdicts and
an overall instruction following score.
"""
rubric_critique_results = []
for rc_result in metric_result_dict["rubric_critique_results"]:
if "verdict" not in rc_result:
rc_result["verdict"] = False # proto3 shows False bool as unset
rubric_critique_results.append(
{
"rubric": rc_result["rubric"],
"verdict": rc_result["verdict"],
}
)
return {
constants.MetricResult.RUBRIC_LEVEL_INSTRUCTION_FOLLOWING_KEY: (
rubric_critique_results
),
constants.MetricResult.SCORE_KEY: (
metric_result_dict.get(constants.MetricResult.SCORE_KEY)
),
}
def handle_response(
response: Union[str, gapic_eval_service_types.EvaluateInstancesResponse],
metric: Union[str, metrics_base._Metric],
) -> Union[str, Dict[str, Any]]:
"""Handles the response from the evaluation service.
Args:
response: The response from the evaluation service.
metric: The metric to evaluate to check the output type.
Returns:
A parsed metric result dictionary, or an error message string.
"""
if isinstance(response, str):
return response
metric_type = response._pb.WhichOneof( # pylint: disable=protected-access
"evaluation_results"
)
if metric_type == constants.MetricResult.EXACT_MATCH_RESULTS:
metric_result = response.exact_match_results
elif metric_type == constants.MetricResult.BLEU_RESULTS:
metric_result = response.bleu_results
elif metric_type == constants.MetricResult.ROUGE_RESULTS:
metric_result = response.rouge_results
elif metric_type == constants.MetricResult.TOOL_CALL_VALID_RESULTS:
metric_result = response.tool_call_valid_results
elif metric_type == constants.MetricResult.TOOL_NAME_MATCH_RESULTS:
metric_result = response.tool_name_match_results
elif metric_type == constants.MetricResult.TOOL_PARAMETER_KEY_MATCH_RESULTS:
metric_result = response.tool_parameter_key_match_results
elif metric_type == constants.MetricResult.TOOL_PARAMETER_KV_MATCH_RESULTS:
metric_result = response.tool_parameter_kv_match_results
elif metric_type == constants.MetricResult.POINTWISE_METRIC_RESULT:
metric_result = response.pointwise_metric_result
elif metric_type == constants.MetricResult.PAIRWISE_METRIC_RESULT:
metric_result = response.pairwise_metric_result
elif metric_type == constants.MetricResult.TRAJECTORY_EXACT_MATCH_RESULTS:
metric_result = response.trajectory_exact_match_results
elif metric_type == constants.MetricResult.TRAJECTORY_IN_ORDER_MATCH_RESULTS:
metric_result = response.trajectory_in_order_match_results
elif metric_type == constants.MetricResult.TRAJECTORY_ANY_ORDER_MATCH_RESULTS:
metric_result = response.trajectory_any_order_match_results
elif metric_type == constants.MetricResult.TRAJECTORY_PRECISION_RESULTS:
metric_result = response.trajectory_precision_results
elif metric_type == constants.MetricResult.TRAJECTORY_RECALL_RESULTS:
metric_result = response.trajectory_recall_results
elif metric_type == constants.MetricResult.TRAJECTORY_SINGLE_TOOL_USE_RESULTS:
metric_result = response.trajectory_single_tool_use_results
elif (
metric_type == constants.MetricResult.RUBRIC_BASED_INSTRUCTION_FOLLOWING_RESULT
):
metric_result = response.rubric_based_instruction_following_result
else:
raise ValueError(f"Unknown metric type: {metric_type}")
metric_result_dict = json_format.MessageToDict(
metric_result._pb, # pylint: disable=protected-access
preserving_proto_field_name=True,
)
    if metric_type in constants.MetricResult.AUTOMATIC_METRIC_RESULTS_LIST:
result = _parse_autometric_results(metric_result_dict)
elif metric_type == constants.MetricResult.POINTWISE_METRIC_RESULT:
result = _parse_pointwise_results(metric_result_dict, metric)
elif metric_type == constants.MetricResult.PAIRWISE_METRIC_RESULT:
result = _parse_pairwise_results(metric_result_dict, metric)
elif (
metric_type == constants.MetricResult.RUBRIC_BASED_INSTRUCTION_FOLLOWING_RESULT
):
result = _parse_rubric_based_instruction_following_results(metric_result_dict)
else:
raise ValueError(f"Unknown metric type: {metric_type}")
return result
def evaluate_instances(
client: gapic_evaluation_services.EvaluationServiceClient,
request: gapic_eval_service_types.EvaluateInstancesRequest,
rate_limiter: utils.RateLimiter,
retry_timeout: float,
) -> gapic_eval_service_types.EvaluateInstancesResponse:
"""Evaluates an instance using Vertex Gen AI Evaluation Service.
Args:
client: The Vertex Gen AI evaluation service client for evaluation.
request: An EvaluateInstancesRequest.
rate_limiter: The rate limiter for evaluation service requests.
retry_timeout: How long to keep retrying the evaluation requests, in seconds.
Returns:
An EvaluateInstancesResponse from Vertex Gen AI Evaluation Service.
"""
rate_limiter.sleep_and_advance()
return client.evaluate_instances(
request=request,
retry=api_core.retry.Retry(
initial=0.250,
maximum=90.0,
multiplier=1.45,
timeout=retry_timeout,
predicate=api_core.retry.if_exception_type(
api_core.exceptions.Aborted,
api_core.exceptions.DeadlineExceeded,
api_core.exceptions.ResourceExhausted,
api_core.exceptions.ServiceUnavailable,
api_core.exceptions.Cancelled,
),
),
)

View File

@@ -0,0 +1,79 @@
# -*- coding: utf-8 -*-
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""ROUGE Metric."""
from typing import Literal
from vertexai.preview.evaluation import constants
from vertexai.preview.evaluation.metrics import _base
class Rouge(_base._AutomaticMetric): # pylint: disable=protected-access
"""The ROUGE Metric.
Calculates the recall of n-grams in prediction as compared to reference and
returns a score ranging between 0 and 1. Supported rouge types are
rougen[1-9], rougeL, and rougeLsum.
"""
_metric_name = constants.Metric.ROUGE
def __init__(
self,
*,
rouge_type: Literal[
"rouge1",
"rouge2",
"rouge3",
"rouge4",
"rouge5",
"rouge6",
"rouge7",
"rouge8",
"rouge9",
"rougeL",
"rougeLsum",
],
use_stemmer: bool = False,
split_summaries: bool = False
):
"""Initializes the ROUGE metric.
Args:
            rouge_type: Supported ROUGE types are rouge1 through rouge9,
                rougeL, and rougeLsum.
use_stemmer: Whether to use stemmer to compute rouge score.
split_summaries: Whether to split summaries while using 'rougeLsum' to
compute rouge score.
"""
self._rouge_type = rouge_type
self._use_stemmer = use_stemmer
self._split_summaries = split_summaries
super().__init__(
metric=Rouge._metric_name,
)
@property
def rouge_type(self) -> str:
return self._rouge_type
@property
def use_stemmer(self) -> bool:
return self._use_stemmer
@property
def split_summaries(self) -> bool:
return self._split_summaries
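# Usage sketch (illustrative): a ROUGE-L-sum metric with stemming enabled,
# splitting summaries on newlines before scoring.
#
#   rouge_metric = Rouge(
#       rouge_type="rougeLsum",
#       use_stemmer=True,
#       split_summaries=True,
#   )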

View File

@@ -0,0 +1,148 @@
# -*- coding: utf-8 -*-
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Schema for autorater metric configuration."""
AUTORATER_METRIC_SCHEMA = """
$schema: https://json-schema.org/draft/2020-12/schema
title: AutoRater Metric Configuration
description: A metric definition for model-based evaluation.
type: object
properties:
metadata:
description: Useful information about the metric.
type: object
properties:
name:
description: Name of the metric.
type: string
description:
description: Description of the metric.
type: string
author:
description: Author of the metric.
type: string
contact:
        description: Point of contact for the metric.
type: string
version:
description: Version of the metric.
type: string
classification:
description: Classification of the metric.
type: string
enum:
- experimental
- benchmarked
- deprecated
required_inputs:
description: Input fields used in the metric prompt template.
type: array
items:
type: string
minItems: 1
uniqueItems: true
benchmarks:
description: List of benchmarks used for the metric.
type: array
items:
type: object
properties:
dataset:
description: Dataset used for benchmarking.
type: string
results:
description: Results from benchmarking.
type: string
required:
- results
minItems: 1
uniqueItems: true
usage:
description: Links to documentation or notebooks with example usage.
type: array
items:
type: string
minItems: 1
uniqueItems: true
required:
- name
- version
- required_inputs
steps:
description: List of steps used for the autorater workflow.
type: array
items:
type: object
properties:
type:
description: Type of the step.
type: string
enum:
- pointwise_metric
- pairwise_metric
- rubric
prompt:
description: Prompt template for the step.
type: object
properties:
system_instruction:
description: System instruction for the model.
type: string
template:
description: Template to populate with inputs from the dataset.
type: string
required:
- template
model:
description: Configuration of the model for the step.
type: object
properties:
model_name_or_endpoint:
description: Name or endpoint of the model.
type: string
required:
- model_name_or_endpoint
options:
description: Options for the step.
type: object
properties:
sample_count:
description: Number of samples for each instance in the dataset.
type: integer
flip_enabled:
description: Whether to flip candidate and baseline responses.
type: boolean
output:
description: Output of the step.
type: object
properties:
type:
description: Type of the output.
type: string
enum:
- raw
required:
- type
required:
- type
- prompt
minItems: 1
uniqueItems: true
required:
- metadata
- steps
"""

View File

@@ -0,0 +1,49 @@
# -*- coding: utf-8 -*-
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from vertexai.preview.evaluation import constants
from vertexai.preview.evaluation.metrics import _base
class TrajectorySingleToolUse(
    _base._AutomaticMetric  # pylint: disable=protected-access
):
"""The TrajectorySingleToolUse Metric.
Evaluates if a tool is present in the trajectory or not.
"""
_metric_name = constants.Metric.TRAJECTORY_SINGLE_TOOL_USE
def __init__(
self,
tool_name: str,
):
"""Initializes the TrajectorySingleToolUse metric.
Args:
            tool_name: The name of the tool to check for.
"""
self._tool_name = tool_name
super().__init__(
metric=TrajectorySingleToolUse._metric_name,
)
@property
def tool_name(self) -> str:
return self._tool_name
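# Usage sketch (illustrative): check whether a hypothetical "search" tool
# appears anywhere in the predicted trajectory.
#
#   single_tool_metric = TrajectorySingleToolUse(tool_name="search")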

View File

@@ -0,0 +1,39 @@
# -*- coding: utf-8 -*-
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Custom output config for model-based metrics."""
from typing import Any, Callable, Dict, Optional
class CustomOutputConfig:
"""Custom output config for model-based metrics.
Attributes:
        return_raw_output: Whether to return the raw output of the judge
            model.
        parsing_fn: Function to parse the raw output of the judge model.
"""
def __init__(
self,
return_raw_output: bool = False,
parsing_fn: Optional[Callable[[str], Dict[str, Any]]] = None,
):
"""Initializes CustomOutputConfig."""
self.return_raw_output = return_raw_output
self.parsing_fn = parsing_fn
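# Usage sketch (illustrative): return the judge model's raw output and parse
# it with a hypothetical function that wraps it in a result dictionary.
#
#   def _parse(raw_output: str) -> Dict[str, Any]:
#       return {"verdict": raw_output.strip()}
#
#   output_config = CustomOutputConfig(return_raw_output=True, parsing_fn=_parse)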

View File

@@ -0,0 +1,395 @@
# -*- coding: utf-8 -*-
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Metric prompt template classes for model-based metrics evaluation."""
from typing import Dict, List, Optional
from google.cloud.aiplatform import base
from vertexai.preview.evaluation import (
prompt_template,
)
_LOGGER = base.Logger(__name__)
_NEWLINE = "\n"
def serialize_dict_in_order(elements: Optional[Dict[str, str]]):
"""Serializes dictionary to ordered string value without brackets."""
if elements is None:
return ""
return _NEWLINE.join(f"{key}: {value}" for key, value in sorted(elements.items()))
class _MetricPromptTemplate(prompt_template.PromptTemplate):
"""Metric prompt template for generic model-based metrics evaluation."""
def __init__(
self,
*,
criteria: Dict[str, str],
rating_rubric: Dict[str, str],
input_variables: List[str],
instruction: Optional[str] = None,
evaluation_steps: Optional[Dict[str, str]] = None,
metric_definition: Optional[str] = None,
few_shot_examples: Optional[List[str]] = None,
):
"""Initializes a metric prompt template."""
self._input_variables = input_variables
self._instruction = instruction
self._metric_definition = metric_definition
self._criteria = criteria
self._rating_rubric = rating_rubric
self._evaluation_steps = evaluation_steps
self._few_shot_examples = few_shot_examples
self.template = self.__str__()
@property
def prompt_data(self) -> str:
return self.template
class PointwiseMetricPromptTemplate(_MetricPromptTemplate):
"""Pointwise metric prompt template for pointwise model-based metrics."""
def __init__(
self,
*,
criteria: Dict[str, str],
rating_rubric: Dict[str, str],
input_variables: Optional[List[str]] = None,
instruction: Optional[str] = None,
metric_definition: Optional[str] = None,
evaluation_steps: Optional[Dict[str, str]] = None,
few_shot_examples: Optional[List[str]] = None,
):
"""Initializes a pointwise metric prompt template.
Args:
criteria: The standards and measures used to evaluate the model
responses. It is a dictionary of criterion names and criterion
definitions.
            rating_rubric: A dictionary mapping rating names to rating
                definitions, used to assign ratings or scores based on
                specific criteria.
input_variables: An optional list of input fields to use in the metric
prompt template for generating model-based evaluation results. Model
"response" column is included by default. If metric_column_mapping is
provided, the mapping values of the input fields will be used to
retrieve data from the evaluation dataset.
instruction: The general instruction to the model that performs the
evaluation. If not provided, a default pointwise metric instruction
will be used.
metric_definition: The optional metric definition. It is a string
describing the metric to be evaluated at a high level. If not
provided, this field will not be included in the prompt template.
            evaluation_steps: The optional guidelines for evaluation steps: a
                dictionary of evaluation step names and evaluation step
                definitions. If not provided, default pointwise metric
                evaluation steps will be used.
few_shot_examples: The optional list of few-shot examples to be used in
the prompt, to provide the model with demonstrations of how to perform
the evaluation, and improve the evaluation accuracy. If not provided,
this field will not be included in the prompt template.
"""
if not input_variables:
input_variables = []
_LOGGER.info(
"The `input_variables` parameter is empty. Only the `response`"
" column is used for computing this model-based metric."
)
input_variables = list(set(input_variables + ["response"]))
instruction = instruction or self.get_default_pointwise_instruction()
evaluation_steps = (
evaluation_steps or self.get_default_pointwise_evaluation_steps()
)
super().__init__(
input_variables=input_variables,
criteria=criteria,
rating_rubric=rating_rubric,
instruction=instruction,
metric_definition=metric_definition,
evaluation_steps=evaluation_steps,
few_shot_examples=few_shot_examples,
)
def get_default_pointwise_instruction(self) -> str:
"""Returns the default instruction for the metric prompt template."""
return (
"You are an expert evaluator. Your task is to evaluate the quality of"
" the responses generated by AI models. We will provide you with the"
" user prompt and an AI-generated responses.\nYou should first read"
" the user input carefully for analyzing the task, and then evaluate"
" the quality of the responses based on the Criteria provided in the"
" Evaluation section below.\nYou will assign the response a rating"
" following the Rating Rubric and Evaluation Steps. Give step by step"
" explanations for your rating, and only choose ratings from the Rating"
" Rubric."
)
def get_default_pointwise_evaluation_steps(self) -> Dict[str, str]:
"""Returns the default evaluation steps for the metric prompt template."""
return {
"Step 1": (
"Assess the response in aspects of all criteria provided. Provide"
" assessment according to each criterion."
),
"Step 2": (
"Score based on the rating rubric. Give a brief rationale to"
" explain your evaluation considering each individual criterion."
),
}
def __str__(self):
"""Serializes the pointwise metric prompt template to a string."""
metric_prompt_template_str = [
"# Instruction",
f"{self._instruction}",
_NEWLINE,
"# Evaluation",
]
if self._metric_definition:
metric_prompt_template_str.extend(
[
"## Metric Definition",
f"{self._metric_definition}\n",
]
)
metric_prompt_template_str.extend(
[
"## Criteria",
f"{serialize_dict_in_order(self._criteria)}\n",
"## Rating Rubric",
f"{serialize_dict_in_order(self._rating_rubric)}\n",
]
)
if self._evaluation_steps:
metric_prompt_template_str.extend(
[
"## Evaluation Steps",
f"{serialize_dict_in_order(self._evaluation_steps)}\n",
]
)
if self._few_shot_examples:
metric_prompt_template_str.extend(
[
"## Evaluation Examples",
f"{_NEWLINE.join(self._few_shot_examples)}\n",
]
)
metric_prompt_template_str.extend(
["\n# User Inputs and AI-generated Response", "## User Inputs"]
)
for input_variable in self._input_variables:
if input_variable == "response":
continue
metric_prompt_template_str.extend(
[
f"### {input_variable}",
f"{{{input_variable}}}\n",
]
)
metric_prompt_template_str.extend(
[
_NEWLINE,
"\n## AI-generated Response",
"{response}",
]
)
return _NEWLINE.join(metric_prompt_template_str)
def __repr__(self):
return (
f"PointwiseMetricPromptTemplate(prompt_data={self.prompt_data},"
f" variables={self.variables})"
)
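# Usage sketch (illustrative): a minimal pointwise template; the criterion and
# rubric contents are hypothetical. str(template) yields the serialized prompt
# with the Instruction and Evaluation sections built in __str__ above.
#
#   template = PointwiseMetricPromptTemplate(
#       criteria={"accuracy": "The response is factually correct."},
#       rating_rubric={"1": "Accurate.", "0": "Inaccurate."},
#       input_variables=["prompt"],
#   )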
class PairwiseMetricPromptTemplate(_MetricPromptTemplate):
"""Pairwise metric prompt template for pairwise model-based metrics."""
def __init__(
self,
*,
criteria: Dict[str, str],
rating_rubric: Dict[str, str],
input_variables: Optional[List[str]] = None,
instruction: Optional[str] = None,
metric_definition: Optional[str] = None,
evaluation_steps: Optional[Dict[str, str]] = None,
few_shot_examples: Optional[List[str]] = None,
):
"""Initializes a pairwise metric prompt template.
Args:
criteria: The standards and measures used to evaluate the model
responses. It is a dictionary of criterion names and criterion
definitions.
            rating_rubric: A dictionary mapping rating names to rating
                definitions, used to assign ratings or scores based on
                specific criteria.
input_variables: An optional list of input fields to use in the metric
prompt template for generating model-based evaluation results.
Candidate model "response" column and "baseline_model_response" column
are included by default. If metric_column_mapping is provided, the
mapping values of the input fields will be used to retrieve data from
the evaluation dataset.
instruction: The general instruction to the model that performs the
evaluation. If not provided, a default pairwise metric instruction
will be used.
metric_definition: The optional metric definition. It is a string
describing the metric to be evaluated at a high level. If not
provided, this field will not be included in the prompt template.
            evaluation_steps: The optional guidelines for evaluation steps: a
                dictionary of evaluation step names and evaluation step
                definitions. If not provided, default pairwise metric
                evaluation steps will be used.
few_shot_examples: The optional list of few-shot examples to be used in
the prompt, to provide the model with demonstrations of how to perform
the evaluation, and improve the evaluation accuracy. If not provided,
this field will not be included in the prompt template.
"""
if not input_variables:
input_variables = []
_LOGGER.info(
"The `input_variables` parameter is empty. Only the `response`"
" column and `baseline_model_response` columns are used for"
" computing this model-based metric."
)
input_variables = list(
set(input_variables + ["response", "baseline_model_response"])
)
instruction = instruction or self.get_default_pairwise_instruction()
evaluation_steps = (
evaluation_steps or self.get_default_pairwise_evaluation_steps()
)
super().__init__(
input_variables=input_variables,
criteria=criteria,
rating_rubric=rating_rubric,
instruction=instruction,
metric_definition=metric_definition,
evaluation_steps=evaluation_steps,
few_shot_examples=few_shot_examples,
)
def get_default_pairwise_instruction(self) -> str:
"""Returns the default instruction for the metric prompt template."""
return (
"You are an expert evaluator. Your task is to evaluate the quality of"
" the responses generated by two AI models. We will provide you with"
" the user input and a pair of AI-generated responses (Response A and"
" Response B).\nYou should first read the user input carefully for"
" analyzing the task, and then evaluate the quality of the responses"
" based on based on the Criteria provided in the Evaluation section"
" below.\nYou will first judge responses individually, following the"
" Rating Rubric and Evaluation Steps. Then you will give step by step"
" explanations for your judgement, compare results to declare the"
" winner based on the Rating Rubric and Evaluation Steps."
)
def get_default_pairwise_evaluation_steps(self) -> Dict[str, str]:
"""Returns the default evaluation steps for the metric prompt template."""
return {
"Step 1": "Analyze Response A based on all the Criteria.",
"Step 2": "Analyze Response B based on all the Criteria.",
"Step 3": (
"Compare the overall performance of Response A and Response B based"
" on your analyses and assessment."
),
"Step 4": (
'Output your preference of "A", "SAME" or "B" to the'
" pairwise_choice field according to the Rating Rubrics."
),
"Step 5": "Output your assessment reasoning in the explanation field",
}
def __str__(self):
"""Serializes the pairwise metric prompt template to a string."""
metric_prompt_template_str = [
"# Instruction",
f"{self._instruction}",
_NEWLINE,
"# Evaluation",
]
if self._metric_definition:
metric_prompt_template_str.extend(
[
"## Metric Definition",
f"{self._metric_definition}\n",
]
)
metric_prompt_template_str.extend(
[
"## Criteria",
f"{serialize_dict_in_order(self._criteria)}\n",
"## Rating Rubric",
f"{serialize_dict_in_order(self._rating_rubric)}\n",
]
)
if self._evaluation_steps:
metric_prompt_template_str.extend(
[
"## Evaluation Steps",
f"{serialize_dict_in_order(self._evaluation_steps)}\n",
]
)
if self._few_shot_examples:
metric_prompt_template_str.extend(
[
"## Evaluation Examples",
f"{_NEWLINE.join(self._few_shot_examples)}\n",
]
)
metric_prompt_template_str.extend(
["\n# User Inputs and AI-generated Responses", "## User Inputs"]
)
for input_variable in self._input_variables:
if input_variable in ["response", "baseline_model_response"]:
continue
metric_prompt_template_str.extend(
[
f"### {input_variable}",
f"{{{input_variable}}}\n",
]
)
metric_prompt_template_str.extend(
[
"\n## AI-generated Responses",
"### Response A",
"{baseline_model_response}\n",
"### Response B",
"{response}",
]
)
return _NEWLINE.join(metric_prompt_template_str)
def __repr__(self):
return (
f"PairwiseMetricPromptTemplate(prompt_data={self.prompt_data},"
f" variables={self.variables})"
)
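# Usage sketch (illustrative): a minimal pairwise template; contents are
# hypothetical. Note that __str__ above renders the baseline response as
# Response A and the candidate response as Response B.
#
#   pairwise_template = PairwiseMetricPromptTemplate(
#       criteria={"helpfulness": "The response addresses the user's request."},
#       rating_rubric={"A": "Response A wins.", "B": "Response B wins.", "SAME": "Tie."},
#   )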

View File

@@ -0,0 +1,197 @@
# -*- coding: utf-8 -*-
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Example metric prompt templates for model-based evaluation."""
from typing import List
from vertexai.preview.evaluation import constants
from vertexai.preview.evaluation.metrics import (
_default_templates,
)
from vertexai.preview.evaluation.metrics import pairwise_metric
from vertexai.preview.evaluation.metrics import pointwise_metric
class MetricPromptTemplateExamples:
"""Examples of metric prompt templates for model-based evaluation."""
_PROMPT_TEMPLATE_MAP = {
constants.Metric.COHERENCE: _default_templates.COHERENCE_PROMPT_TEMPLATE,
constants.Metric.FLUENCY: _default_templates.FLUENCY_PROMPT_TEMPLATE,
constants.Metric.SAFETY: _default_templates.SAFETY_PROMPT_TEMPLATE,
constants.Metric.GROUNDEDNESS: (
_default_templates.GROUNDEDNESS_PROMPT_TEMPLATE
),
constants.Metric.INSTRUCTION_FOLLOWING: (
_default_templates.INSTRUCTION_FOLLOWING_PROMPT_TEMPLATE
),
constants.Metric.VERBOSITY: _default_templates.VERBOSITY_PROMPT_TEMPLATE,
constants.Metric.TEXT_QUALITY: (
_default_templates.TEXT_QUALITY_PROMPT_TEMPLATE
),
constants.Metric.SUMMARIZATION_QUALITY: (
_default_templates.SUMMARIZATION_QUALITY_PROMPT_TEMPLATE
),
constants.Metric.QUESTION_ANSWERING_QUALITY: (
_default_templates.QUESTION_ANSWERING_QUALITY_PROMPT_TEMPLATE
),
constants.Metric.MULTI_TURN_CHAT_QUALITY: (
_default_templates.MULTI_TURN_CHAT_QUALITY_PROMPT_TEMPLATE
),
constants.Metric.MULTI_TURN_SAFETY: (
_default_templates.MULTI_TURN_SAFETY_PROMPT_TEMPLATE
),
constants.Metric.PAIRWISE_COHERENCE: (
_default_templates.PAIRWISE_COHERENCE_PROMPT_TEMPLATE
),
constants.Metric.PAIRWISE_FLUENCY: (
_default_templates.PAIRWISE_FLUENCY_PROMPT_TEMPLATE
),
constants.Metric.PAIRWISE_SAFETY: (
_default_templates.PAIRWISE_SAFETY_PROMPT_TEMPLATE
),
constants.Metric.PAIRWISE_GROUNDEDNESS: (
_default_templates.PAIRWISE_GROUNDEDNESS_PROMPT_TEMPLATE
),
constants.Metric.PAIRWISE_INSTRUCTION_FOLLOWING: (
_default_templates.PAIRWISE_INSTRUCTION_FOLLOWING_PROMPT_TEMPLATE
),
constants.Metric.PAIRWISE_VERBOSITY: (
_default_templates.PAIRWISE_VERBOSITY_PROMPT_TEMPLATE
),
constants.Metric.PAIRWISE_TEXT_QUALITY: (
_default_templates.PAIRWISE_TEXT_QUALITY_PROMPT_TEMPLATE
),
constants.Metric.PAIRWISE_SUMMARIZATION_QUALITY: (
_default_templates.PAIRWISE_SUMMARIZATION_QUALITY_PROMPT_TEMPLATE
),
constants.Metric.PAIRWISE_QUESTION_ANSWERING_QUALITY: (
_default_templates.PAIRWISE_QUESTION_ANSWERING_QUALITY_PROMPT_TEMPLATE
),
constants.Metric.PAIRWISE_MULTI_TURN_CHAT_QUALITY: (
_default_templates.PAIRWISE_MULTI_TURN_CHAT_QUALITY_PROMPT_TEMPLATE
),
constants.Metric.PAIRWISE_MULTI_TURN_SAFETY: (
_default_templates.PAIRWISE_MULTI_TURN_SAFETY_PROMPT_TEMPLATE
),
}
@classmethod
def get_prompt_template(cls, metric_name: str) -> str:
"""Returns the prompt template for the given metric name."""
return cls._PROMPT_TEMPLATE_MAP[metric_name]
@classmethod
def list_example_metric_names(cls) -> List[str]:
"""Returns a list of all metric prompt templates."""
return list(cls._PROMPT_TEMPLATE_MAP.keys())
class Pointwise:
"""Example PointwiseMetric instances."""
FLUENCY = pointwise_metric.PointwiseMetric(
metric=constants.Metric.FLUENCY,
metric_prompt_template=_default_templates.FLUENCY_PROMPT_TEMPLATE,
)
COHERENCE = pointwise_metric.PointwiseMetric(
metric=constants.Metric.COHERENCE,
metric_prompt_template=_default_templates.COHERENCE_PROMPT_TEMPLATE,
)
SAFETY = pointwise_metric.PointwiseMetric(
metric=constants.Metric.SAFETY,
metric_prompt_template=_default_templates.SAFETY_PROMPT_TEMPLATE,
)
GROUNDEDNESS = pointwise_metric.PointwiseMetric(
metric=constants.Metric.GROUNDEDNESS,
metric_prompt_template=_default_templates.GROUNDEDNESS_PROMPT_TEMPLATE,
)
INSTRUCTION_FOLLOWING = pointwise_metric.PointwiseMetric(
metric=constants.Metric.INSTRUCTION_FOLLOWING,
metric_prompt_template=_default_templates.INSTRUCTION_FOLLOWING_PROMPT_TEMPLATE,
)
VERBOSITY = pointwise_metric.PointwiseMetric(
metric=constants.Metric.VERBOSITY,
metric_prompt_template=_default_templates.VERBOSITY_PROMPT_TEMPLATE,
)
TEXT_QUALITY = pointwise_metric.PointwiseMetric(
metric=constants.Metric.TEXT_QUALITY,
metric_prompt_template=_default_templates.TEXT_QUALITY_PROMPT_TEMPLATE,
)
SUMMARIZATION_QUALITY = pointwise_metric.PointwiseMetric(
metric=constants.Metric.SUMMARIZATION_QUALITY,
metric_prompt_template=_default_templates.SUMMARIZATION_QUALITY_PROMPT_TEMPLATE,
)
QUESTION_ANSWERING_QUALITY = pointwise_metric.PointwiseMetric(
metric=constants.Metric.QUESTION_ANSWERING_QUALITY,
metric_prompt_template=_default_templates.QUESTION_ANSWERING_QUALITY_PROMPT_TEMPLATE,
)
MULTI_TURN_CHAT_QUALITY = pointwise_metric.PointwiseMetric(
metric=constants.Metric.MULTI_TURN_CHAT_QUALITY,
metric_prompt_template=_default_templates.MULTI_TURN_CHAT_QUALITY_PROMPT_TEMPLATE,
)
MULTI_TURN_SAFETY_QUALITY = pointwise_metric.PointwiseMetric(
metric=constants.Metric.MULTI_TURN_SAFETY,
metric_prompt_template=_default_templates.MULTI_TURN_SAFETY_PROMPT_TEMPLATE,
)
class Pairwise:
"""Example PairwiseMetric instances."""
FLUENCY = pairwise_metric.PairwiseMetric(
metric=constants.Metric.PAIRWISE_FLUENCY,
metric_prompt_template=_default_templates.PAIRWISE_FLUENCY_PROMPT_TEMPLATE,
)
COHERENCE = pairwise_metric.PairwiseMetric(
metric=constants.Metric.PAIRWISE_COHERENCE,
metric_prompt_template=_default_templates.PAIRWISE_COHERENCE_PROMPT_TEMPLATE,
)
SAFETY = pairwise_metric.PairwiseMetric(
metric=constants.Metric.PAIRWISE_SAFETY,
metric_prompt_template=_default_templates.PAIRWISE_SAFETY_PROMPT_TEMPLATE,
)
GROUNDEDNESS = pairwise_metric.PairwiseMetric(
metric=constants.Metric.PAIRWISE_GROUNDEDNESS,
metric_prompt_template=_default_templates.PAIRWISE_GROUNDEDNESS_PROMPT_TEMPLATE,
)
INSTRUCTION_FOLLOWING = pairwise_metric.PairwiseMetric(
metric=constants.Metric.PAIRWISE_INSTRUCTION_FOLLOWING,
metric_prompt_template=_default_templates.PAIRWISE_INSTRUCTION_FOLLOWING_PROMPT_TEMPLATE,
)
VERBOSITY = pairwise_metric.PairwiseMetric(
metric=constants.Metric.PAIRWISE_VERBOSITY,
metric_prompt_template=_default_templates.PAIRWISE_VERBOSITY_PROMPT_TEMPLATE,
)
TEXT_QUALITY = pairwise_metric.PairwiseMetric(
metric=constants.Metric.PAIRWISE_TEXT_QUALITY,
metric_prompt_template=_default_templates.PAIRWISE_TEXT_QUALITY_PROMPT_TEMPLATE,
)
SUMMARIZATION_QUALITY = pairwise_metric.PairwiseMetric(
metric=constants.Metric.PAIRWISE_SUMMARIZATION_QUALITY,
metric_prompt_template=_default_templates.PAIRWISE_SUMMARIZATION_QUALITY_PROMPT_TEMPLATE,
)
QUESTION_ANSWERING_QUALITY = pairwise_metric.PairwiseMetric(
metric=constants.Metric.PAIRWISE_QUESTION_ANSWERING_QUALITY,
metric_prompt_template=_default_templates.PAIRWISE_QUESTION_ANSWERING_QUALITY_PROMPT_TEMPLATE,
)
MULTI_TURN_CHAT_QUALITY = pairwise_metric.PairwiseMetric(
metric=constants.Metric.PAIRWISE_MULTI_TURN_CHAT_QUALITY,
metric_prompt_template=_default_templates.PAIRWISE_MULTI_TURN_CHAT_QUALITY_PROMPT_TEMPLATE,
)
MULTI_TURN_SAFETY_QUALITY = pairwise_metric.PairwiseMetric(
metric=constants.Metric.PAIRWISE_MULTI_TURN_SAFETY,
metric_prompt_template=_default_templates.PAIRWISE_MULTI_TURN_SAFETY_PROMPT_TEMPLATE,
)
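

# --- Usage sketch (not part of the original module; added for illustration) ---
# Assumes the metric-name constants above resolve to plain strings such as
# "fluency" or "pairwise_coherence", matching the keys registered in
# `_PROMPT_TEMPLATE_MAP`.
if __name__ == "__main__":
    # Enumerate every metric that ships with an example prompt template.
    for name in MetricPromptTemplateExamples.list_example_metric_names():
        print(name)
    # Fetch a single template by its metric name and inspect its text.
    print(MetricPromptTemplateExamples.get_prompt_template("fluency"))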

View File

@@ -0,0 +1,133 @@
# -*- coding: utf-8 -*-
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Model-based Pairwise Metric."""
from typing import Callable, Optional, Union
from google.cloud.aiplatform_v1beta1.types import (
evaluation_service as gapic_eval_service_types,
)
from vertexai.preview import generative_models
from vertexai.preview.evaluation.metrics import _base
from vertexai.preview.evaluation.metrics import (
custom_output_config as custom_output_config_class,
)
from vertexai.preview.evaluation.metrics import (
metric_prompt_template as metric_prompt_template_base,
)

class PairwiseMetric(_base._ModelBasedMetric):  # pylint: disable=protected-access
"""A Model-based Pairwise Metric.

    A model-based evaluation metric that compares two generative models'
    responses side by side, allowing users to A/B test their models and
    determine which one performs better.
For more details on when to use pairwise metrics, see
[Evaluation methods and
metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#pointwise_versus_pairwise).
Result Details:
* In `EvalResult.summary_metrics`, win rates for both the baseline and
          candidate model are computed. The win rate is computed as the
          proportion of one model's wins over the total number of attempts,
          expressed as a decimal value between 0 and 1.
* In `EvalResult.metrics_table`, a pairwise metric produces two
evaluation results per dataset row:
* `pairwise_choice`: The choice shows whether the candidate model or
the baseline model performs better, or if they are equally good.
            * `explanation`: The rationale behind each verdict, generated with
              chain-of-thought reasoning. The explanation helps users scrutinize
              the judgment and builds appropriate trust in the decisions.
See [documentation
page](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#understand-results)
for more details on understanding the metric results.
Usage Examples:
```
baseline_model = GenerativeModel("gemini-1.0-pro")
candidate_model = GenerativeModel("gemini-1.5-pro")
pairwise_groundedness = PairwiseMetric(
metric_prompt_template=MetricPromptTemplateExamples.get_prompt_template(
"pairwise_groundedness"
),
baseline_model=baseline_model,
)
eval_dataset = pd.DataFrame({
"prompt" : [...],
})
pairwise_task = EvalTask(
dataset=eval_dataset,
metrics=[pairwise_groundedness],
experiment="my-pairwise-experiment",
)
pairwise_result = pairwise_task.evaluate(
model=candidate_model,
experiment_run_name="gemini-pairwise-eval-run",
)
```
"""
def __init__(
self,
*,
metric: str,
metric_prompt_template: Union[
metric_prompt_template_base.PairwiseMetricPromptTemplate, str
],
baseline_model: Optional[
Union[generative_models.GenerativeModel, Callable[[str], str]]
] = None,
system_instruction: Optional[str] = None,
autorater_config: Optional[gapic_eval_service_types.AutoraterConfig] = None,
custom_output_config: Optional[
custom_output_config_class.CustomOutputConfig
] = None,
):
"""Initializes a pairwise evaluation metric.
Args:
metric: The pairwise evaluation metric name.
metric_prompt_template: Pairwise metric prompt template for performing
the pairwise model-based evaluation. A freeform string is also accepted.
          baseline_model: The baseline model for side-by-side comparison. If not
            specified, the `baseline_model_response` column is required in the
            dataset to perform bring-your-own-response (BYOR) evaluation.
system_instruction: The system instruction for the evaluation.
          autorater_config: The config for the judge model.
custom_output_config: Config for custom output from the judge model.
"""
super().__init__(
metric_prompt_template=metric_prompt_template,
metric=metric,
system_instruction=system_instruction,
autorater_config=autorater_config,
custom_output_config=custom_output_config,
)
self._baseline_model = baseline_model
@property
def baseline_model(
self,
    ) -> Optional[Union[generative_models.GenerativeModel, Callable[[str], str]]]:
return self._baseline_model
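

# --- Usage sketch: bring-your-own-response (BYOR) evaluation ---
# Not part of the original module; added for illustration. Per the docstring
# above, when `baseline_model` is omitted the dataset itself must supply the
# `baseline_model_response` column (plus a `response` column for the candidate
# model), and `EvalTask.evaluate()` is then called without a model argument.
if __name__ == "__main__":
    import pandas as pd

    from vertexai.preview.evaluation import EvalTask
    from vertexai.preview.evaluation.metrics import (
        metric_prompt_template_examples,
    )

    byor_dataset = pd.DataFrame({
        "prompt": ["Summarize the article in two sentences."],
        "response": ["Candidate model's answer..."],
        "baseline_model_response": ["Baseline model's answer..."],
    })
    pairwise_text_quality = PairwiseMetric(
        metric="pairwise_text_quality",
        metric_prompt_template=(
            metric_prompt_template_examples.MetricPromptTemplateExamples
            .get_prompt_template("pairwise_text_quality")
        ),
    )
    result = EvalTask(
        dataset=byor_dataset,
        metrics=[pairwise_text_quality],
    ).evaluate()
    print(result.summary_metrics)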

View File

@@ -0,0 +1,95 @@
# -*- coding: utf-8 -*-
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Model-based Pointwise Metric."""
from typing import Optional, Union
from google.cloud.aiplatform_v1beta1.types import (
evaluation_service as gapic_eval_service_types,
)
from vertexai.preview.evaluation.metrics import _base
from vertexai.preview.evaluation.metrics import (
custom_output_config as custom_output_config_class,
)
from vertexai.preview.evaluation.metrics import (
metric_prompt_template as metric_prompt_template_base,
)

class PointwiseMetric(_base._ModelBasedMetric):  # pylint: disable=protected-access
"""A Model-based Pointwise Metric.

    A model-based evaluation metric that evaluates a single generative model's
response.
For more details on when to use model-based pointwise metrics, see
[Evaluation methods and metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval).
Usage Examples:
```
candidate_model = GenerativeModel("gemini-1.5-pro")
eval_dataset = pd.DataFrame({
"prompt" : [...],
})
fluency_metric = PointwiseMetric(
metric="fluency",
metric_prompt_template=MetricPromptTemplateExamples.get_prompt_template('fluency'),
)
pointwise_eval_task = EvalTask(
dataset=eval_dataset,
metrics=[
fluency_metric,
MetricPromptTemplateExamples.Pointwise.GROUNDEDNESS,
],
)
pointwise_result = pointwise_eval_task.evaluate(
model=candidate_model,
)
```
"""
def __init__(
self,
*,
metric: str,
metric_prompt_template: Union[
metric_prompt_template_base.PointwiseMetricPromptTemplate, str
],
system_instruction: Optional[str] = None,
autorater_config: Optional[gapic_eval_service_types.AutoraterConfig] = None,
custom_output_config: Optional[
custom_output_config_class.CustomOutputConfig
] = None,
):
"""Initializes a pointwise evaluation metric.
Args:
metric: The pointwise evaluation metric name.
metric_prompt_template: Pointwise metric prompt template for performing
the model-based evaluation. A freeform string is also accepted.
system_instruction: The system instruction for the evaluation.
          autorater_config: The config for the judge model.
custom_output_config: Config for custom output from the judge model.
"""
super().__init__(
metric_prompt_template=metric_prompt_template,
metric=metric,
system_instruction=system_instruction,
autorater_config=autorater_config,
custom_output_config=custom_output_config,
)
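

# --- Usage sketch: a freeform string as the metric prompt template ---
# Not part of the original module; added for illustration. As noted in the
# Args above, `metric_prompt_template` also accepts a freeform string; the
# assumption here is that `{placeholder}` fields are filled from same-named
# columns of the evaluation dataset. The "conciseness" metric name is
# hypothetical.
if __name__ == "__main__":
    conciseness_metric = PointwiseMetric(
        metric="conciseness",
        metric_prompt_template=(
            "Rate the conciseness of the response on a scale of 1 to 5 and"
            " explain your rating.\n\n"
            "# Prompt\n{prompt}\n\n"
            "# Response\n{response}\n"
        ),
    )
    # Pass the metric to an EvalTask via `metrics=[conciseness_metric]`.
    print(conciseness_metric)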

View File

@@ -0,0 +1,126 @@
# -*- coding: utf-8 -*-
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from google.cloud.aiplatform_v1beta1.types import (
evaluation_service as gapic_eval_service_types,
)
from vertexai.preview.evaluation import utils
from vertexai.preview.evaluation.metrics import (
_base as metrics_base,
)
from vertexai.preview.evaluation.metrics import (
_default_templates,
)
from vertexai.preview.evaluation.metrics import (
custom_output_config,
)
from vertexai.preview.evaluation.metrics import pairwise_metric
from vertexai.preview.evaluation.metrics import pointwise_metric
from vertexai.preview.evaluation.metrics import (
rubric_based_metric,
)
AutoraterConfig = gapic_eval_service_types.AutoraterConfig
_POINTWISE_OUTPUT_CONFIG = custom_output_config.CustomOutputConfig(
return_raw_output=True,
parsing_fn=utils.parse_pointwise_rubric_result,
)
_PAIRWISE_OUTPUT_CONFIG = custom_output_config.CustomOutputConfig(
return_raw_output=True,
parsing_fn=utils.parse_pairwise_rubric_result,
)
_PAIRWISE_AUTORATER_CONFIG = AutoraterConfig(
sampling_count=1,
)

class PredefinedRubricMetrics:
"""Predefined rubric-based metrics."""
class Pointwise:
"""Pointwise rubric-based metrics."""
INSTRUCTION_FOLLOWING = rubric_based_metric.RubricBasedMetric(
generation_config=metrics_base.RubricGenerationConfig(
prompt_template=_default_templates.INSTRUCTION_FOLLOWING_RUBRIC_GENERATION_PROMPT_TEMPLATE,
),
critique_metric=pointwise_metric.PointwiseMetric(
metric="rb_instruction_following",
metric_prompt_template=_default_templates.INSTRUCTION_FOLLOWING_RUBRIC_CRITIQUE_TEMPLATE,
custom_output_config=_POINTWISE_OUTPUT_CONFIG,
),
)
MULTIMODAL_UNDERSTANDING = rubric_based_metric.RubricBasedMetric(
generation_config=metrics_base.RubricGenerationConfig(
prompt_template=_default_templates.MULTIMODAL_UNDERSTANDING_RUBRIC_GENERATION_PROMPT_TEMPLATE
),
critique_metric=pointwise_metric.PointwiseMetric(
metric="rb_multimodal_understanding",
metric_prompt_template=_default_templates.MULTIMODAL_UNDERSTANDING_RUBRIC_CRITIQUE_TEMPLATE,
custom_output_config=_POINTWISE_OUTPUT_CONFIG,
),
)
TEXT_QUALITY = rubric_based_metric.RubricBasedMetric(
generation_config=metrics_base.RubricGenerationConfig(
prompt_template=_default_templates.TEXT_QUALITY_RUBRIC_GENERATION_PROMPT_TEMPLATE
),
critique_metric=pointwise_metric.PointwiseMetric(
metric="rb_text_quality",
metric_prompt_template=_default_templates.TEXT_QUALITY_RUBRIC_CRITIQUE_TEMPLATE,
custom_output_config=_POINTWISE_OUTPUT_CONFIG,
),
)
class Pairwise:
"""Pairwise rubric-based metrics."""
INSTRUCTION_FOLLOWING = rubric_based_metric.RubricBasedMetric(
generation_config=metrics_base.RubricGenerationConfig(
prompt_template=_default_templates.INSTRUCTION_FOLLOWING_RUBRIC_GENERATION_PROMPT_TEMPLATE,
),
critique_metric=pairwise_metric.PairwiseMetric(
metric="pairwise_rb_instruction_following",
metric_prompt_template=_default_templates.PAIRWISE_INSTRUCTION_FOLLOWING_RUBRIC_CRITIQUE_TEMPLATE,
custom_output_config=_PAIRWISE_OUTPUT_CONFIG,
autorater_config=_PAIRWISE_AUTORATER_CONFIG,
),
)
MULTIMODAL_UNDERSTANDING = rubric_based_metric.RubricBasedMetric(
generation_config=metrics_base.RubricGenerationConfig(
prompt_template=_default_templates.MULTIMODAL_UNDERSTANDING_RUBRIC_GENERATION_PROMPT_TEMPLATE
),
critique_metric=pairwise_metric.PairwiseMetric(
metric="pairwise_rb_multimodal_understanding",
metric_prompt_template=_default_templates.PAIRWISE_MULTIMODAL_UNDERSTANDING_RUBRIC_CRITIQUE_TEMPLATE,
custom_output_config=_PAIRWISE_OUTPUT_CONFIG,
autorater_config=_PAIRWISE_AUTORATER_CONFIG,
),
)
TEXT_QUALITY = rubric_based_metric.RubricBasedMetric(
generation_config=metrics_base.RubricGenerationConfig(
prompt_template=_default_templates.TEXT_QUALITY_RUBRIC_GENERATION_PROMPT_TEMPLATE
),
critique_metric=pairwise_metric.PairwiseMetric(
metric="pairwise_rb_text_quality",
metric_prompt_template=_default_templates.PAIRWISE_TEXT_QUALITY_RUBRIC_CRITIQUE_TEMPLATE,
custom_output_config=_PAIRWISE_OUTPUT_CONFIG,
autorater_config=_PAIRWISE_AUTORATER_CONFIG,
),
)
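

# --- Usage sketch (not part of the original module; added for illustration) ---
# Rubric-based metrics run in two phases: `generate_rubrics()` augments the
# dataset with generated rubrics, then the augmented dataset is scored through
# `EvalTask` with the same metric instance.
if __name__ == "__main__":
    import pandas as pd

    from vertexai.preview.evaluation import EvalTask

    rb_text_quality = PredefinedRubricMetrics.Pointwise.TEXT_QUALITY
    dataset = pd.DataFrame({
        "prompt": ["Write a haiku about the ocean."],
        "response": ["Waves fold into foam / gulls stitch the gray morning /"
                     " salt hums in the air"],
    })
    dataset_with_rubrics = rb_text_quality.generate_rubrics(dataset)
    result = EvalTask(
        dataset=dataset_with_rubrics,
        metrics=[rb_text_quality],
    ).evaluate()
    print(result.summary_metrics)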

View File

@@ -0,0 +1,104 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import collections
from typing import Union, TYPE_CHECKING
from google.cloud.aiplatform import base
from vertexai import generative_models
from vertexai.preview.evaluation import _pre_eval_utils
from vertexai.preview.evaluation import constants
from vertexai.preview.evaluation import utils
from vertexai.preview.evaluation.metrics import (
_base as metrics_base,
)
from vertexai.preview.evaluation.metrics import pairwise_metric
from vertexai.preview.evaluation.metrics import pointwise_metric
if TYPE_CHECKING:
import pandas as pd
_DEFAULT_MODEL_NAME = "gemini-2.0-flash-001"
_LOGGER = base.Logger(__name__)

class RubricBasedMetric(metrics_base._Metric):
"""Config for Rubric-Based Eval."""
def __init__(
self,
*,
generation_config: metrics_base.RubricGenerationConfig,
critique_metric: Union[
pointwise_metric.PointwiseMetric, pairwise_metric.PairwiseMetric
]
):
"""Initializes RubricBasedMetric.
Args:
generation_config: Config for rubric generation.
critique_metric: Pointwise/pairwise metric for rubric critique.
"""
super().__init__(metric=critique_metric._metric)
self.generation_config = generation_config
self.critique_metric = critique_metric
def generate_rubrics(
self,
eval_dataset: "pd.Dataframe",
) -> "pd.DataFrame":
"""Generates rubrics for given eval dataset."""
if not self.generation_config.model:
model = generative_models.GenerativeModel(model_name=_DEFAULT_MODEL_NAME)
else:
model = self.generation_config.model
if constants.Dataset.RUBRICS_COLUMN in eval_dataset.columns:
_LOGGER.warning(
"Rubrics column already exists in the dataset. Skipping rubric"
" generation."
)
return eval_dataset
responses = _pre_eval_utils._generate_responses_from_gemini_model(
model,
eval_dataset,
self.generation_config.prompt_template,
)
if self.generation_config.parsing_fn:
parsing_fn = self.generation_config.parsing_fn
else:
parsing_fn = utils.parse_rubrics
dataset_with_rubrics = eval_dataset.copy()
aggregated = collections.defaultdict(list)
for idx, response in enumerate(responses):
result = parsing_fn(response)
if isinstance(result, dict):
questions = result.pop("questions", None)
if questions is not None:
aggregated[constants.Dataset.RUBRICS_COLUMN].append(
(idx, questions)
)
for key, value in result.items():
aggregated[key].append((idx, value))
else:
aggregated[constants.Dataset.RUBRICS_COLUMN].append((idx, result))
for key, values in aggregated.items():
dataset_with_rubrics[key] = None
dataset_with_rubrics[key] = dataset_with_rubrics[key].astype(object)
for idx, value in values:
dataset_with_rubrics.at[idx, key] = value
return dataset_with_rubrics
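

# --- Sketch of the `parsing_fn` contract (not part of the original module) ---
# As the aggregation loop above shows, a parsing function may return either a
# bare value (stored under the rubrics column) or a dict: its "questions" key
# feeds the rubrics column, and every remaining key becomes an extra dataset
# column. The parser below is hypothetical.
if __name__ == "__main__":
    def demo_parsing_fn(response_text: str) -> dict:
        # One rubric question per non-empty line; keep the raw model output
        # alongside for debugging.
        return {
            "questions": [
                line.strip()
                for line in response_text.splitlines()
                if line.strip()
            ],
            "raw_rubric_output": response_text,
        }

    parsed = demo_parsing_fn("Is the response grounded?\nIs it concise?\n")
    print(parsed["questions"])          # two rubric questions
    print(parsed["raw_rubric_output"])  # raw text kept as its own column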