structure saas with tools
@@ -0,0 +1,980 @@
# -*- coding: utf-8 -*-

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Evaluation Orchestration Library."""

import collections
from concurrent import futures
import copy
import time
from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TYPE_CHECKING, Union

from google.cloud.aiplatform import base
from google.cloud.aiplatform_v1beta1.types import (
    content as gapic_content_types,
)
from vertexai import generative_models
from vertexai.evaluation import _base as evaluation_base
from vertexai.evaluation import constants
from vertexai.evaluation import (
    prompt_template as prompt_template_base,
)
from vertexai.evaluation import utils
from vertexai.evaluation.metrics import (
    _base as metrics_base,
)
from vertexai.evaluation.metrics import (
    _instance_evaluation,
)
from vertexai.evaluation.metrics import (
    metric_prompt_template_examples,
)
from vertexai.evaluation.metrics import pairwise_metric
from vertexai.evaluation.metrics import pointwise_metric


try:
    from tqdm import tqdm
except ImportError:
    raise ImportError(
        'tqdm is not installed. Please install the SDK using "pip install'
        ' google-cloud-aiplatform[evaluation]"'
    )

if TYPE_CHECKING:
    import pandas as pd

_LOGGER = base.Logger(__name__)
_SUCCESSFUL_FINISH_REASONS = [
    gapic_content_types.Candidate.FinishReason.STOP,
    gapic_content_types.Candidate.FinishReason.MAX_TOKENS,
    # Many responses have this finish reason
    gapic_content_types.Candidate.FinishReason.FINISH_REASON_UNSPECIFIED,
]


def _validate_metrics(metrics: List[Union[str, metrics_base._Metric]]) -> None:
    """Validates the metrics list.

    Args:
      metrics: The list of metric names, or Metric instances to
        evaluate.

    Raises:
      ValueError: If the metrics list is empty, or if multiple metrics with
        the same metric name are found.
    """
    if not metrics:
        raise ValueError("Metrics cannot be empty.")

    seen_strings = set()
    seen_metric_names = set()

    for metric in metrics:
        if isinstance(metric, str):
            if metric in seen_strings:
                raise ValueError(f"Duplicate string metric name found: '{metric}'")
            seen_strings.add(metric)
        elif isinstance(metric, metrics_base._Metric):
            if metric.metric_name in seen_metric_names:
                raise ValueError(
                    "Duplicate Metric instances of the same metric name found: "
                    f"'{metric.metric_name}'"
                )
            seen_metric_names.add(metric.metric_name)


def _validate_metric_column_map(
    evaluation_run_config: evaluation_base.EvaluationRunConfig,
):
    """Validates the column map for metric prompt template usage."""
    for metric in evaluation_run_config.metrics:
        if isinstance(
            metric, metrics_base._ModelBasedMetric  # pylint: disable=protected-access
        ):
            for variable in prompt_template_base.PromptTemplate(
                metric.metric_prompt_template
            ).variables:
                if (
                    evaluation_run_config.metric_column_mapping.get(variable, "")
                    not in evaluation_run_config.dataset.columns
                ):
                    raise ValueError(
                        f"Cannot find the `{variable}` column in the evaluation"
                        " dataset to fill the metric prompt template for"
                        f" `{str(metric)}` metric. Please check if the column is"
                        " present in the evaluation dataset, or provide a"
                        " key-value pair in `metric_column_mapping` parameter"
                        " of `EvalTask` to map it to a different column name."
                        " The evaluation dataset columns are"
                        f" {list(evaluation_run_config.dataset.columns)}."
                    )


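# NOTE: Illustrative sketch, not part of the library API. It shows the shape of
# a `metric_column_mapping` that satisfies the check above: every variable in a
# model-based metric's prompt template must resolve to an existing dataset
# column, either directly or through this mapping. The template variables and
# dataset column names below are hypothetical.
def _example_metric_column_mapping() -> Dict[str, str]:
    # A metric prompt template using `{context}` and `{response}` evaluated on
    # a dataset with columns "news_context" and "model_1_response".
    return {
        "context": "news_context",  # template variable -> dataset column
        "response": "model_1_response",
    }

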
def _validate_dataset(
    evaluation_run_config: evaluation_base.EvaluationRunConfig,
) -> None:
    """Validates that the required columns exist in the dataset."""
    _validate_response_column_required(evaluation_run_config)
    _validate_reference_column_required(evaluation_run_config)
    _validate_reference_or_source_column_required(evaluation_run_config)


def _validate_response_column_required(
    evaluation_run_config: evaluation_base.EvaluationRunConfig,
) -> None:
    """Validates that the response column exists in the dataset."""
    for metric in evaluation_run_config.metrics:
        if metric in constants.Metric.AUTOMATIC_METRIC_LIST or isinstance(
            metric, metrics_base._TranslationMetric  # pylint: disable=protected-access
        ):
            _validate_column_provided(
                evaluation_run_config,
                constants.Dataset.MODEL_RESPONSE_COLUMN,
            )


def _validate_reference_column_required(
    evaluation_run_config: evaluation_base.EvaluationRunConfig,
) -> None:
    """Validates that the reference column exists in the dataset."""
    if set(evaluation_run_config.metrics).intersection(
        set(constants.Metric.AUTOMATIC_METRIC_LIST)
    ):
        _validate_column_provided(
            evaluation_run_config,
            constants.Dataset.REFERENCE_COLUMN,
        )


def _validate_column_provided(
    evaluation_run_config: evaluation_base.EvaluationRunConfig,
    column_name: str,
) -> None:
    """Validates that the required column exists in the dataset."""
    if column_name not in evaluation_run_config.metric_column_mapping:
        evaluation_run_config.metric_column_mapping[column_name] = column_name
    evaluation_run_config.validate_dataset_column(column_name)


def _validate_reference_or_source_column_required(
    evaluation_run_config: evaluation_base.EvaluationRunConfig,
) -> None:
    """Validates that a reference or source column exists in the dataset."""
    for metric in evaluation_run_config.metrics:
        if isinstance(
            metric, metrics_base._TranslationMetric  # pylint: disable=protected-access
        ):
            # Validate the reference column.
            # This is optional if source column is provided.
            try:
                _validate_column_provided(
                    evaluation_run_config,
                    constants.Dataset.REFERENCE_COLUMN,
                )
            except KeyError:
                # Reference column is optional. Checking for source column.
                _validate_column_provided(
                    evaluation_run_config,
                    constants.Dataset.SOURCE_COLUMN,
                )


def _compute_custom_metrics(
    row_dict: Dict[str, Any],
    custom_metrics: List[metrics_base.CustomMetric],
    pbar: tqdm,
    executor: futures.ThreadPoolExecutor,
) -> Dict[str, Any]:
    """Computes custom metrics for a row.

    Args:
      row_dict: A dictionary of an instance in the eval dataset.
      custom_metrics: A list of CustomMetrics.
      pbar: A tqdm progress bar.
      executor: A thread pool executor.

    Returns:
      A dictionary of an instance containing custom metric results.

    Raises:
      KeyError: If the custom metric function does not return a valid output.
    """
    futures_by_metric = collections.defaultdict(list)
    for custom_metric in custom_metrics:
        future = executor.submit(custom_metric.metric_function, row_dict)
        future.add_done_callback(lambda _: pbar.update(1))
        futures_by_metric[custom_metric].append(future)

    for custom_metric, futures_list in futures_by_metric.items():
        for future in futures_list:
            metric_output = future.result()
            try:
                row_dict[
                    f"{custom_metric.name}/{constants.MetricResult.SCORE_KEY}"
                ] = metric_output[custom_metric.name]
            except KeyError:
                raise KeyError(
                    f"Custom metric score `{custom_metric.name}` not found in"
                    f" the metric output {metric_output}. Please make sure the"
                    " custom metric function is valid, and the output"
                    f" dictionary uses `{custom_metric.name}` as the key for"
                    " metric score."
                )
            # Include additional metric results like explanation.
            for key, value in metric_output.items():
                if key != custom_metric.name:
                    row_dict[f"{custom_metric.name}/{key}"] = value
    return row_dict


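# NOTE: Illustrative sketch, not part of the library API. It shows the output
# contract `_compute_custom_metrics` expects from a `CustomMetric` function: a
# dict keyed by the metric name for the score, plus optional extra keys (such
# as an explanation) that are copied into the row as `<metric name>/<key>`.
# The metric name and scoring rule below are hypothetical.
def _example_custom_metric_fn(instance: Dict[str, Any]) -> Dict[str, Any]:
    response = str(instance.get("response", ""))
    word_count = len(response.split())
    return {
        # Must match the `name` passed to CustomMetric(name=..., metric_function=...).
        "word_count": word_count,
        "explanation": f"The response contains {word_count} words.",
    }

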
def _separate_custom_metrics(
    metrics: List[Union[str, metrics_base._Metric]],
) -> Tuple[List[Union[str, metrics_base._Metric]], List[metrics_base.CustomMetric],]:
    """Separates the metrics list into API and custom metrics."""
    custom_metrics = []
    api_metrics = []
    for metric in metrics:
        if isinstance(metric, metrics_base.CustomMetric):
            custom_metrics.append(metric)
        else:
            api_metrics.append(metric)
    return api_metrics, custom_metrics


def _aggregate_summary_metrics(
    evaluation_run_config: evaluation_base.EvaluationRunConfig,
    metrics_table: "pd.DataFrame",
) -> Dict[str, Any]:
    """Computes summary metrics.

    Args:
      evaluation_run_config: Evaluation Run Configurations.
      metrics_table: A dataframe containing per-instance metrics results.

    Returns:
      A dictionary containing summary metrics results and statistics.
    """
    summary_metrics = {}
    summary_metrics[constants.MetricResult.ROW_COUNT_KEY] = metrics_table.shape[0]

    for metric in evaluation_run_config.metrics:
        try:
            if isinstance(metric, pairwise_metric.PairwiseMetric):
                summary_metrics[f"{metric.metric_name}/candidate_model_win_rate"] = (
                    metrics_table[
                        f"{metric.metric_name}/{constants.MetricResult.PAIRWISE_CHOICE_KEY}"
                    ]
                    == "CANDIDATE"
                ).mean()
                summary_metrics[f"{metric.metric_name}/baseline_model_win_rate"] = (
                    metrics_table[
                        f"{metric.metric_name}/{constants.MetricResult.PAIRWISE_CHOICE_KEY}"
                    ]
                    == "BASELINE"
                ).mean()
            else:
                summary_metrics[f"{str(metric)}/mean"] = metrics_table.loc[
                    :, f"{str(metric)}/{constants.MetricResult.SCORE_KEY}"
                ].mean()
                summary_metrics[f"{str(metric)}/std"] = metrics_table.loc[
                    :, f"{str(metric)}/{constants.MetricResult.SCORE_KEY}"
                ].std()
        except (ValueError, KeyError) as e:
            _LOGGER.warning(
                f"Failed to compute metric statistics for `{metric}` metric."
                f" {type(e).__name__}: {e}"
            )
            continue
    return summary_metrics


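# NOTE: Illustrative sketch, not part of the library API. It mirrors the
# aggregation above on a tiny in-memory metrics table to show the summary keys
# produced for a pointwise metric: `<metric>/mean` and `<metric>/std`, next to
# the row count. The metric name "fluency" and the scores are hypothetical.
def _example_summary_metrics() -> Dict[str, Any]:
    import pandas as pd

    toy_metrics_table = pd.DataFrame({"fluency/score": [4.0, 5.0, 3.0]})
    return {
        "row_count": toy_metrics_table.shape[0],
        "fluency/mean": toy_metrics_table["fluency/score"].mean(),
        "fluency/std": toy_metrics_table["fluency/score"].std(),
    }

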
def _generate_content_text_response(
    model: generative_models.GenerativeModel, prompt: str, max_retries: int = 3
) -> str:
    """Generates a text response from the Gemini model for a text prompt, with retries.

    Args:
      model: The Gemini model instance.
      prompt: The prompt to send to the model.
      max_retries: Maximum number of retries for response generation.

    Returns:
      The text response from the model.
      Returns constants.RESPONSE_ERROR if there is an error after all retries.
    """
    for retry_attempt in range(max_retries):
        try:
            response = model.generate_content(prompt)
            if not response.candidates:
                error_message = (
                    f"The model response was blocked due to"
                    f" {response._raw_response.prompt_feedback.block_reason.name}.\n"
                    f"Blocked reason message:"
                    f" {response._raw_response.prompt_feedback.block_reason_message}.\n"
                    "The input prompt may be blocked for safety reasons.\n"
                    f"Prompt: {prompt}.\n"
                    f"Retry attempt: {retry_attempt + 1}/{max_retries}"
                )
                _LOGGER.warning(error_message)
                break
            else:
                candidate = response.candidates[0]
                if candidate.finish_reason not in _SUCCESSFUL_FINISH_REASONS:
                    error_message = (
                        "The model response did not finish"
                        " successfully.\n"
                        f"Finish reason: {candidate.finish_reason}.\n"
                        f"Finish message: {candidate.finish_message}.\n"
                        f"Safety ratings: {candidate.safety_ratings}.\n"
                        "Please adjust the model safety_settings, or"
                        " try a different prompt.\n"
                        f"Retry attempt: {retry_attempt + 1}/{max_retries}"
                    )
                    _LOGGER.warning(error_message)
                else:
                    return response.candidates[0].content.parts[0].text
        except Exception as e:
            error_message = (
                f"Failed to generate response candidates from Gemini model"
                f" {model._model_name}.\n"
                f"Error: {e}.\n"
                f"Prompt: {prompt}.\n"
                f"Retry attempt: {retry_attempt + 1}/{max_retries}"
            )
            _LOGGER.warning(error_message)
        if retry_attempt < max_retries - 1:
            _LOGGER.info(
                f"Retrying response generation for prompt: {prompt}, attempt"
                f" {retry_attempt + 1}/{max_retries}..."
            )

    final_error_message = (
        f"Failed to generate response from Gemini model {model._model_name}.\n"
        f"Prompt: {prompt}."
    )
    _LOGGER.warning(final_error_message)
    return constants.RESPONSE_ERROR


def _generate_responses_from_gemini_model(
    model: generative_models.GenerativeModel,
    evaluation_run_config: evaluation_base.EvaluationRunConfig,
    is_baseline_model: bool = False,
) -> None:
    """Generates responses from the Gemini model.

    Args:
      model: The Gemini model instance.
      evaluation_run_config: Evaluation Run Configurations.
      is_baseline_model: Whether the model is a baseline model for PairwiseMetric.
    """
    # Ensure thread safety and avoid race conditions.
    df = evaluation_run_config.dataset.copy()

    _LOGGER.info(
        f"Generating a total of {evaluation_run_config.dataset.shape[0]} "
        f"responses from Gemini model {model._model_name.split('/')[-1]}."
    )
    tasks = []
    with tqdm(total=len(df)) as pbar:
        with futures.ThreadPoolExecutor(max_workers=constants.MAX_WORKERS) as executor:
            for _, row in df.iterrows():
                task = executor.submit(
                    _generate_content_text_response,
                    prompt=row[constants.Dataset.PROMPT_COLUMN],
                    model=model,
                )
                task.add_done_callback(lambda _: pbar.update(1))
                tasks.append(task)
    responses = [future.result() for future in tasks]
    if is_baseline_model:
        evaluation_run_config.dataset = df.assign(baseline_model_response=responses)
    else:
        evaluation_run_config.dataset = df.assign(response=responses)

    _LOGGER.info(
        f"All {evaluation_run_config.dataset.shape[0]} responses are successfully"
        f" generated from Gemini model {model._model_name.split('/')[-1]}."
    )


def _generate_response_from_custom_model_fn(
    model_fn: Callable[[str], str],
    evaluation_run_config: evaluation_base.EvaluationRunConfig,
    is_baseline_model: bool = False,
) -> None:
    """Generates responses from a custom model function.

    Args:
      model_fn: The custom model function.
      evaluation_run_config: Evaluation Run Configurations.
      is_baseline_model: Whether the model is a baseline model for
        PairwiseMetric.
    """
    eval_dataset = evaluation_run_config.dataset.copy()
    max_workers = 5

    _LOGGER.info(
        f"Generating a total of {evaluation_run_config.dataset.shape[0]} "
        "responses from the custom model function."
    )
    tasks = []
    try:
        with tqdm(total=len(eval_dataset)) as pbar:
            with futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
                for _, row in eval_dataset.iterrows():
                    task = executor.submit(
                        model_fn, row[constants.Dataset.PROMPT_COLUMN]
                    )
                    task.add_done_callback(lambda _: pbar.update(1))
                    tasks.append(task)
    except (ValueError, IndexError) as e:
        _LOGGER.warning(f"Failed to generate response from model function: {e}")

    responses = [task.result() for task in tasks]
    if is_baseline_model:
        evaluation_run_config.dataset = eval_dataset.assign(
            baseline_model_response=responses
        )
    else:
        evaluation_run_config.dataset = eval_dataset.assign(response=responses)

    _LOGGER.info(
        f"All {evaluation_run_config.dataset.shape[0]} responses are successfully"
        " generated from the custom model function."
    )


def _run_model_inference(
    model: Union[generative_models.GenerativeModel, Callable[[str], str]],
    evaluation_run_config: evaluation_base.EvaluationRunConfig,
    response_column_name: str = constants.Dataset.MODEL_RESPONSE_COLUMN,
) -> None:
    """Runs model inference on dataset for evaluation.

    Args:
      model: The model or baseline model or a custom model function to
        generate responses to evaluate.
      evaluation_run_config: Evaluation Run Configurations.
      response_column_name: Column name key in metric_column_mapping. Value is
        constants.Dataset.MODEL_RESPONSE_COLUMN or
        constants.Dataset.BASELINE_MODEL_RESPONSE_COLUMN.

    Raises:
      ValueError: If the model or baseline model is not supported.
    """
    is_baseline_model = (
        response_column_name == constants.Dataset.BASELINE_MODEL_RESPONSE_COLUMN
    )
    if model:
        if response_column_name not in evaluation_run_config.metric_column_mapping:
            if constants.Dataset.PROMPT_COLUMN in evaluation_run_config.dataset.columns:
                t1 = time.perf_counter()
                if isinstance(model, generative_models.GenerativeModel):
                    _generate_responses_from_gemini_model(
                        model, evaluation_run_config, is_baseline_model
                    )
                elif callable(model):
                    _generate_response_from_custom_model_fn(
                        model, evaluation_run_config, is_baseline_model
                    )
                else:
                    raise ValueError(
                        f"Unsupported model or baseline model type: {type(model)}"
                    )
                t2 = time.perf_counter()
                _LOGGER.info(f"Multithreaded Batch Inference took: {t2 - t1} seconds.")
                evaluation_run_config.metric_column_mapping[
                    response_column_name
                ] = response_column_name
            else:
                raise ValueError(
                    "Missing required input `prompt` column to start model inference."
                    " Please provide a `prompt_template` parameter in"
                    " `EvalTask.evaluate()` function if you want to assemble a"
                    " `prompt` column with variables from the dataset, or provide a"
                    " `prompt` column in dataset to directly use as input to"
                    " the model. Mappings in `metric_column_mapping` do not"
                    " apply for model inference and are used for evaluation only."
                )
        else:
            raise ValueError(
                "The `model` parameter or `baseline_model` in pairwise metric is"
                " specified, but the evaluation `dataset` contains model response"
                " column or baseline model response column"
                f" `{evaluation_run_config.metric_column_mapping[response_column_name]}`"
                " to perform bring-your-own-response(BYOR) evaluation. If you would"
                " like to perform evaluation using the dataset with the"
                " existing model response column or baseline model response column"
                f" `{evaluation_run_config.metric_column_mapping[response_column_name]}`,"
                " please remove `model` parameter in `EvalTask.evaluate()`"
                " function or `baseline_model` in `PairwiseMetric`."
            )


def _check_variable_columns_exist(
    dataset: "pd.DataFrame", variable_names_set: Set[str]
) -> None:
    """Checks if all variable names exist in the dataset columns.

    Args:
      dataset: The dataset to evaluate.
      variable_names_set: A set of variable names.

    Raises:
      ValueError: If any variable names do not exist in the dataset columns
        or the prompt template is invalid.
    """
    actual_column_names_set = set(dataset.columns)
    if not variable_names_set.issubset(actual_column_names_set):
        missing_columns = variable_names_set - actual_column_names_set
        raise ValueError(
            "Failed to assemble prompt template: The following column(s) are"
            f" missing: {', '.join(missing_columns)}. "
            f"Please verify prompt_template variables {variable_names_set} and "
            f"evaluation dataset column names {actual_column_names_set}."
        )


def _assemble_prompt_for_dataset(
    evaluation_run_config: evaluation_base.EvaluationRunConfig,
    prompt_template: Union[prompt_template_base.PromptTemplate, str],
) -> None:
    """Assembles a prompt column in the evaluation dataset from variable columns.

    Args:
      evaluation_run_config: Evaluation Run Configurations.
      prompt_template: A `PromptTemplate` object or a prompt template string
        with variables that can be assembled from the evaluation dataset. The
        variables can be represented in curly braces `{variable}`, and
        must be included in the dataset columns if specified. The variable
        names cannot contain spaces.

    Returns:
      None. The assembled `prompt` column is added to the evaluation dataset
      in place.

    Raises:
      ValueError: If any variable names do not exist in the dataset columns
        or the prompt template is invalid.
    """
    if not prompt_template:
        raise ValueError("Prompt template cannot be an empty string.")

    _LOGGER.info(
        "Assembling prompts from the `prompt_template`. The `prompt` column in"
        " the `EvalResult.metrics_table` has the assembled prompts used for model"
        " response generation."
    )
    if isinstance(prompt_template, str):
        prompt_template = prompt_template_base.PromptTemplate(prompt_template)
    _check_variable_columns_exist(
        evaluation_run_config.dataset, prompt_template.variables
    )

    try:
        evaluation_run_config.dataset[
            constants.Dataset.PROMPT_COLUMN
        ] = evaluation_run_config.dataset.apply(
            lambda row: str(
                prompt_template.assemble(
                    **row[list(prompt_template.variables)].astype(str).to_dict(),
                )
            ),
            axis=1,
        )
        if (
            constants.Dataset.PROMPT_COLUMN
            in evaluation_run_config.metric_column_mapping
            and evaluation_run_config.metric_column_mapping[
                constants.Dataset.PROMPT_COLUMN
            ]
            != constants.Dataset.PROMPT_COLUMN
        ):
            _LOGGER.warning(
                "The `prompt` column mapping provided in"
                " `metric_column_mapping` parameter is overwritten by the"
                " assembled `prompt` column because the `prompt_template`"
                " parameter is provided. Please verify that you want to use"
                " the assembled `prompt` column for evaluation."
            )
        evaluation_run_config.metric_column_mapping[
            constants.Dataset.PROMPT_COLUMN
        ] = constants.Dataset.PROMPT_COLUMN
    except Exception as e:
        raise ValueError(
            f"Failed to assemble prompt template: {e}. Please make sure all"
            " variables in `prompt_template` are present in the evaluation"
            f" dataset columns: `{list(evaluation_run_config.dataset.columns)}`."
        ) from e


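# NOTE: Illustrative sketch, not part of the library API. It shows how a prompt
# template with `{variable}` placeholders is assembled row by row from dataset
# columns, mirroring what `_assemble_prompt_for_dataset` does on the evaluation
# dataset. The template text and column names below are hypothetical.
def _example_prompt_assembly() -> List[str]:
    import pandas as pd

    template = prompt_template_base.PromptTemplate(
        "Answer the question using the context.\n"
        "Context: {context}\nQuestion: {question}"
    )
    df = pd.DataFrame(
        {"context": ["Paris is in France."], "question": ["Where is Paris?"]}
    )
    return df.apply(
        lambda row: str(template.assemble(**row.astype(str).to_dict())), axis=1
    ).tolist()

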
def _set_metric_table(
    metric_name: str,
    metric_results: Any,
    metrics_table: "pd.DataFrame",
    metric_result_key: str,
):
    """Parses values from metric results into the metrics_table."""
    if metric_result_key == constants.MetricResult.SCORE_KEY:
        metric_result_items = [
            result.get(metric_result_key) if isinstance(result, dict) else None
            for result in metric_results
        ]
    else:
        metric_result_items = [
            result.get(metric_result_key) if isinstance(result, dict) else "Error"
            for result in metric_results
        ]
    metrics_table[f"{metric_name}/{metric_result_key}"] = metric_result_items


def _parse_metric_results_to_dataframe(
    instance_df: "pd.DataFrame", results: Dict[Union[str, metrics_base._Metric], Any]
) -> "pd.DataFrame":
    """Parses metric results to a pandas dataframe.

    Args:
      instance_df: A dataframe containing per-instance metrics results.
      results: A dictionary containing metric results.

    Returns:
      A dataframe containing per-instance metrics results. Each metric result
      can contain metric score, explanation, and confidence.
    """
    try:
        import pandas as pd
    except ImportError:
        raise ImportError(
            'Pandas is not installed. Please install the SDK using "pip install'
            ' google-cloud-aiplatform[evaluation]"'
        )

    metrics_table = pd.DataFrame(dict(zip(instance_df.columns, instance_df.values.T)))
    for metric, metric_results in results.items():
        if isinstance(metric, pointwise_metric.PointwiseMetric):
            _set_metric_table(
                metric.metric_name,
                metric_results,
                metrics_table,
                constants.MetricResult.EXPLANATION_KEY,
            )
            _set_metric_table(
                metric.metric_name,
                metric_results,
                metrics_table,
                constants.MetricResult.SCORE_KEY,
            )
        elif isinstance(metric, pairwise_metric.PairwiseMetric):
            _set_metric_table(
                metric.metric_name,
                metric_results,
                metrics_table,
                constants.MetricResult.EXPLANATION_KEY,
            )
            _set_metric_table(
                metric.metric_name,
                metric_results,
                metrics_table,
                constants.MetricResult.PAIRWISE_CHOICE_KEY,
            )
        elif str(metric) in constants.Metric.AUTOMATIC_METRIC_LIST:
            _set_metric_table(
                str(metric),
                metric_results,
                metrics_table,
                constants.MetricResult.SCORE_KEY,
            )
        elif isinstance(
            metric, metrics_base._TranslationMetric  # pylint: disable=protected-access
        ):
            _set_metric_table(
                str(metric),
                metric_results,
                metrics_table,
                constants.MetricResult.SCORE_KEY,
            )
        else:
            _LOGGER.warning(
                f"Metric name: {str(metric)} is not supported when parsing"
                " metric results."
            )

    return metrics_table


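# NOTE: Illustrative sketch, not part of the library API. It lists the kind of
# column names the parser above adds to `metrics_table`: one
# `<metric name>/<result key>` column per parsed result, next to the original
# dataset columns. The metric names shown are hypothetical examples.
def _example_metrics_table_columns() -> List[str]:
    return [
        "prompt",                             # original dataset column
        "response",                           # original dataset column
        "fluency/score",                      # pointwise metric: score + explanation
        "fluency/explanation",
        "pairwise_quality/pairwise_choice",   # pairwise metric: choice + explanation
        "pairwise_quality/explanation",
        "exact_match/score",                  # automatic metric: score only
    ]

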
def _compute_metrics(
    evaluation_run_config: evaluation_base.EvaluationRunConfig,
) -> evaluation_base.EvalResult:
    """Computes the metrics for the dataset.

    Args:
      evaluation_run_config: Evaluation Run Configurations.

    Returns:
      The evaluation results for the input metrics.

    Raises:
      RuntimeError: The number of responses does not match the number of metrics.
    """
    try:
        import pandas as pd
    except ImportError:
        raise ImportError(
            'Pandas is not installed. Please install the SDK using "pip install'
            ' google-cloud-aiplatform[evaluation]"'
        )

    api_metrics, custom_metrics = _separate_custom_metrics(
        evaluation_run_config.metrics
    )
    row_count = len(evaluation_run_config.dataset)
    api_request_count = len(api_metrics) * row_count
    custom_metric_request_count = len(custom_metrics) * row_count
    total_request_count = api_request_count + custom_metric_request_count

    _LOGGER.info(
        f"Computing metrics with a total of {total_request_count} Vertex Gen AI"
        " Evaluation Service API requests."
    )

    instance_list = []
    futures_by_metric = collections.defaultdict(list)
    rate_limiter = utils.RateLimiter(evaluation_run_config.evaluation_service_qps)
    with tqdm(total=total_request_count) as pbar:
        with futures.ThreadPoolExecutor(max_workers=constants.MAX_WORKERS) as executor:
            for idx, row in evaluation_run_config.dataset.iterrows():
                row_dict = _compute_custom_metrics(
                    row.to_dict(), custom_metrics, pbar, executor
                )
                instance_list.append(row_dict)
                for metric in api_metrics:
                    future = executor.submit(
                        _instance_evaluation.evaluate_instances,
                        client=evaluation_run_config.client,
                        request=_instance_evaluation.build_request(
                            metric=metric,
                            row_dict=row_dict,
                            evaluation_run_config=evaluation_run_config,
                        ),
                        rate_limiter=rate_limiter,
                        retry_timeout=evaluation_run_config.retry_timeout,
                    )
                    future.add_done_callback(lambda _: pbar.update(1))
                    futures_by_metric[metric].append((future, idx))

            # Retrieve results from all futures and handle errors.
            results_dict = collections.defaultdict(list)
            error_list = []
            for metric, futures_list in futures_by_metric.items():
                for future, index in futures_list:
                    try:
                        response = future.result()
                        results_dict[metric].append(response)
                    except Exception as e:
                        results_dict[metric].append("Error")
                        error_list.append((metric, index, f"Error: {e}"))

    for metric, responses in results_dict.items():
        results_dict[metric] = [
            _instance_evaluation.handle_response(response) for response in responses
        ]
    if error_list:
        _LOGGER.warning(
            f"{len(error_list)} errors encountered during evaluation. Continue to"
            " compute summary metrics for the rest of the dataset."
        )
        for metric_name, index, error in error_list:
            _LOGGER.warning(
                f"Error encountered for metric {metric_name} at dataset index"
                f" {index}: {error}"
            )
    else:
        _LOGGER.info(
            f"All {total_request_count} metric requests are successfully computed."
        )

    instance_df = pd.DataFrame.from_dict(instance_list)
    metrics_table = _parse_metric_results_to_dataframe(instance_df, results_dict)

    # Aggregate the summary metrics.
    summary_metrics = _aggregate_summary_metrics(evaluation_run_config, metrics_table)

    return evaluation_base.EvalResult(
        summary_metrics=summary_metrics, metrics_table=metrics_table
    )


def _get_baseline_model(evaluation_run_config: evaluation_base.EvaluationRunConfig):
    """Gets the baseline model from the pairwise metrics."""
    pairwise_metric_instances = [
        metric
        for metric in evaluation_run_config.metrics
        if isinstance(metric, pairwise_metric.PairwiseMetric)
    ]
    baseline_models = {
        instance.metric_name: instance.baseline_model
        for instance in pairwise_metric_instances
    }
    if len(set(baseline_models.values())) > 1:
        raise ValueError(
            "Not all `PairwiseMetric` instances have the same `baseline_model`. "
            f"Here are the detected baseline models: `{baseline_models}`. "
            "Please separate pairwise metrics with different baseline models "
            "in different `EvalTask` or use the same baseline model for "
            "all pairwise metrics."
        )
    return pairwise_metric_instances[0].baseline_model


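# NOTE: Illustrative sketch, not part of the library API. All `PairwiseMetric`
# instances in a single evaluation run must share the same `baseline_model`,
# which is what `_get_baseline_model` enforces above. A configured Vertex AI
# environment is assumed, and the model name, metric names, and template text
# below are hypothetical.
def _example_pairwise_metrics_with_shared_baseline() -> List[
    pairwise_metric.PairwiseMetric
]:
    baseline = generative_models.GenerativeModel("gemini-1.0-pro")
    template = (
        "Compare the two responses to the prompt and choose the better one.\n"
        "Prompt: {prompt}\n"
        "Baseline response: {baseline_model_response}\n"
        "Candidate response: {response}"
    )
    return [
        pairwise_metric.PairwiseMetric(
            metric=name,
            metric_prompt_template=template,
            baseline_model=baseline,  # the same baseline for every pairwise metric
        )
        for name in ("pairwise_quality", "pairwise_helpfulness")
    ]

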
def _convert_metric_prompt_template_example(metrics):
    """Converts string metric names to generic model-based metric instances."""
    updated_metrics = []
    for metric in metrics:
        if metric in constants.Metric.POINTWISE_METRIC_PROMPT_TEMPLATE_EXAMPLE_LIST:
            template = metric_prompt_template_examples.MetricPromptTemplateExamples.get_prompt_template(
                metric
            )
            metric = pointwise_metric.PointwiseMetric(
                metric=metric, metric_prompt_template=template
            )
        elif metric in constants.Metric.PAIRWISE_METRIC_PROMPT_TEMPLATE_EXAMPLE_LIST:
            template = metric_prompt_template_examples.MetricPromptTemplateExamples.get_prompt_template(
                metric
            )
            metric = pairwise_metric.PairwiseMetric(
                metric=metric, metric_prompt_template=template
            )
            _LOGGER.info(
                f"Pairwise metric `{metric.metric_name}` loaded from"
                " `MetricPromptTemplateExamples` does not have `baseline_model`"
                " specified and only supports Bring-Your-Own-Response(BYOR)"
                " evaluation. If you would like to run inference on the baseline model,"
                " please instantiate a `PairwiseMetric` and provide the"
                " `baseline_model` parameter."
            )
        updated_metrics.append(metric)
    return updated_metrics


def evaluate(
    dataset: "pd.DataFrame",
    metrics: List[Union[str, metrics_base._Metric]],
    *,
    model: Optional[
        Union[generative_models.GenerativeModel, Callable[[str], str]]
    ] = None,
    prompt_template: Optional[Union[str, prompt_template_base.PromptTemplate]] = None,
    metric_column_mapping: Dict[str, str],
    evaluation_service_qps: Optional[float] = None,
    retry_timeout: float = 600.0,
) -> evaluation_base.EvalResult:
    """Runs the evaluation for metrics.

    Args:
      dataset: The dataset to evaluate.
      metrics: The list of metric names, or Metric instances to
        evaluate. Prompt template is required for PairwiseMetric.
      model: The GenerativeModel instance or a custom model function to generate
        responses to evaluate. If not provided, the evaluation is computed with
        the `response` column in the `dataset`.
      prompt_template: A `PromptTemplate` or a prompt template string compatible
        with `PromptTemplate` class with variables that can be formatted with
        dataset columns to create assembled prompts. The variables can be
        represented in curly braces `{variable_name}`, and must be included in the
        dataset columns if specified. The variable names cannot contain spaces.
      metric_column_mapping: An optional dictionary column mapping that
        overrides the metric prompt template input variable names with the
        mapped evaluation dataset column names, used during evaluation.
        For example, if the input_variables of the metric prompt template
        are ["context", "reference"], the metric_column_mapping can be
          {
              "context": "news_context",
              "reference": "ground_truth",
              "response": "model_1_response",
          }
        if the dataset has columns "news_context", "ground_truth" and
        "model_1_response".
      evaluation_service_qps: The custom QPS limit for the evaluation service.
      retry_timeout: How long to keep retrying the evaluation requests for the
        whole evaluation dataset, in seconds.

    Returns:
      EvalResult with summary metrics and a metrics table for per-instance
      metrics.

    Raises:
      ValueError: If the metrics list is empty, or the prompt template is not
        provided for PairwiseMetric, or multiple baseline models are specified for
        PairwiseMetric instances, or both model and dataset model response column
        are present.
    """
    _validate_metrics(metrics)
    metrics = _convert_metric_prompt_template_example(metrics)
    copied_metrics = []
    for metric in metrics:
        if isinstance(metric, pairwise_metric.PairwiseMetric):
            copied_metrics.append(
                pairwise_metric.PairwiseMetric(
                    metric=metric.metric_name,
                    metric_prompt_template=metric.metric_prompt_template,
                    baseline_model=metric.baseline_model,
                )
            )
        else:
            copied_metrics.append(copy.deepcopy(metric))
    evaluation_run_config = evaluation_base.EvaluationRunConfig(
        dataset=dataset.copy(deep=True),
        metrics=copied_metrics,
        metric_column_mapping=copy.deepcopy(metric_column_mapping),
        client=utils.create_evaluation_service_client(),
        evaluation_service_qps=(
            evaluation_service_qps
            if evaluation_service_qps
            else constants.QuotaLimit.EVAL_SERVICE_QPS
        ),
        retry_timeout=retry_timeout,
    )

    if prompt_template:
        _assemble_prompt_for_dataset(evaluation_run_config, prompt_template)

    _run_model_inference(
        model=model,
        evaluation_run_config=evaluation_run_config,
        response_column_name=constants.Dataset.MODEL_RESPONSE_COLUMN,
    )
    _validate_dataset(evaluation_run_config)

    pairwise_metric_exists = any(
        isinstance(metric, pairwise_metric.PairwiseMetric)
        for metric in evaluation_run_config.metrics
    )
    if pairwise_metric_exists:
        baseline_model = _get_baseline_model(evaluation_run_config)
        _run_model_inference(
            model=baseline_model,
            evaluation_run_config=evaluation_run_config,
            response_column_name=constants.Dataset.BASELINE_MODEL_RESPONSE_COLUMN,
        )

    _validate_metric_column_map(evaluation_run_config)
    t1 = time.perf_counter()
    evaluation_result = _compute_metrics(evaluation_run_config)
    t2 = time.perf_counter()
    _LOGGER.info(f"Evaluation took: {t2 - t1} seconds.")

    return evaluation_result


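# NOTE: Illustrative usage sketch, not part of this module. The private
# `evaluate` orchestration above is normally reached through the public
# `EvalTask` API. The dataset columns, metric names, and prompt template below
# are hypothetical, and an initialized Vertex AI project is assumed.
def _example_evaluate_usage():
    import pandas as pd
    from vertexai.evaluation import EvalTask

    eval_dataset = pd.DataFrame(
        {
            "context": ["Paris is the capital of France."],
            "question": ["What is the capital of France?"],
            "response": ["Paris."],  # bring-your-own-response evaluation
        }
    )
    eval_task = EvalTask(dataset=eval_dataset, metrics=["coherence", "fluency"])
    # No `model` is passed, so the existing `response` column is scored; the
    # `prompt_template` assembles a `prompt` column from the dataset columns.
    return eval_task.evaluate(
        prompt_template="Context: {context}\nQuestion: {question}"
    )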