# -*- coding: utf-8 -*-

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
from typing import Any, Callable, Dict, List, Literal, Optional, TYPE_CHECKING, Union
import uuid
import warnings

from google.api_core import exceptions
import vertexai
from google.cloud.aiplatform import base
from google.cloud.aiplatform import utils
from google.cloud.aiplatform.metadata import metadata
from vertexai import generative_models
from vertexai.evaluation import _base as eval_base
from vertexai.evaluation import _evaluation
from vertexai.evaluation import constants
from vertexai.evaluation import utils as eval_utils
from vertexai.evaluation.metrics import (
    _base as metrics_base,
)
from vertexai.evaluation.metrics import (
    pairwise_metric,
)
from vertexai.evaluation.metrics import (
    pointwise_metric,
)
import numpy as np

if TYPE_CHECKING:
    import pandas as pd

# pylint: disable=g-import-not-at-top
try:
    from IPython import display as IPython_display
except ImportError:
    IPython_display = None

_LOGGER = base.Logger(__name__)
logging.getLogger("urllib3.connectionpool").setLevel(logging.ERROR)
warnings.filterwarnings("ignore")

EvalResult = eval_base.EvalResult
GenerativeModel = generative_models.GenerativeModel


class EvalTask:
    """A class representing an EvalTask.

    An evaluation task assesses the ability of a Gen AI model, agent, or
    application to perform a specific task in response to prompts. Each
    evaluation task includes an evaluation dataset, which can be a set of test
    cases, and a set of metrics for assessment. These tasks provide the
    framework for running evaluations in a standardized and repeatable way,
    allowing for comparative assessment with varying run-specific parameters.

    Dataset Details:

        Default dataset column names:

            * prompt_column_name: "prompt"
            * reference_column_name: "reference"
            * response_column_name: "response"
            * baseline_model_response_column_name: "baseline_model_response"
            * rubrics_column_name: "rubrics"

        Requirement for different use cases:

            * Bring-your-own-response (BYOR): You already have the data that
              you want to evaluate stored in the dataset. The response column
              name can be customized by providing the `response_column_name`
              parameter, or in the `metric_column_mapping`. For BYOR pairwise
              evaluation, the baseline model response column name can be
              customized by providing the `baseline_model_response_column_name`
              parameter, or in the `metric_column_mapping`. If the `response`
              column or `baseline_model_response` column is present while the
              corresponding model is specified, an error will be raised.

            * Perform model/agent inference without a prompt template: You have
              a dataset containing the input prompts to the model/agent and
              want to perform inference before evaluation. A column named
              `prompt` is required in the evaluation dataset and is used
              directly as input to the model/agent.

            * Perform model/agent inference with a prompt template: You have a
              dataset containing the input variables to the prompt template and
              want to assemble the prompts for inference. The evaluation
              dataset must contain column names corresponding to the variable
              names in the prompt template. For example, if the prompt template
              is "Instruction: {instruction}, context: {context}", the dataset
              must contain `instruction` and `context` columns.

    Metrics Details:

        The supported metrics descriptions, rating rubrics, and the required
        input variables can be found on the Vertex AI public documentation page
        for [Evaluation methods and metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval).

    Usage Examples:

        1. To perform bring-your-own-response (BYOR) evaluation, provide the
        model responses in the `response` column in the dataset. If a pairwise
        metric is used for BYOR evaluation, provide the baseline model
        responses in the `baseline_model_response` column.

        ```
        eval_dataset = pd.DataFrame({
            "prompt": [...],
            "reference": [...],
            "response": [...],
            "baseline_model_response": [...],
        })
        eval_task = EvalTask(
            dataset=eval_dataset,
            metrics=[
                "bleu",
                "rouge_l_sum",
                MetricPromptTemplateExamples.Pointwise.FLUENCY,
                MetricPromptTemplateExamples.Pairwise.SAFETY,
            ],
            experiment="my-experiment",
        )
        eval_result = eval_task.evaluate(experiment_run_name="eval-experiment-run")
        ```
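
        If model responses are stored under a custom column name, map that
        column to the expected `response` input, for example with the
        `response_column_name` parameter of `evaluate()` (the column and run
        names below are illustrative):

        ```
        eval_dataset = pd.DataFrame({
            "prompt": [...],
            "reference": [...],
            "model_1_response": [...],
        })
        eval_result = EvalTask(
            dataset=eval_dataset,
            metrics=["bleu", MetricPromptTemplateExamples.Pointwise.FLUENCY],
            experiment="my-experiment",
        ).evaluate(
            response_column_name="model_1_response",
            experiment_run_name="byor-custom-column-run",
        )
        ```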

        2. To perform evaluation with Gemini model inference, specify the
        `model` parameter with a `GenerativeModel` instance. The input column
        name to the model is `prompt` and must be present in the dataset.

        ```
        eval_dataset = pd.DataFrame({
            "reference": [...],
            "prompt": [...],
        })
        result = EvalTask(
            dataset=eval_dataset,
            metrics=["exact_match", "bleu", "rouge_1", "rouge_l_sum"],
            experiment="my-experiment",
        ).evaluate(
            model=GenerativeModel("gemini-1.5-pro"),
            experiment_run_name="gemini-eval-run",
        )
        ```

        3. If a `prompt_template` is specified, the `prompt` column is not
        required. Prompts can be assembled from the evaluation dataset, and all
        prompt template variable names must be present in the dataset columns.

        ```
        eval_dataset = pd.DataFrame({
            "context": [...],
            "instruction": [...],
        })
        result = EvalTask(
            dataset=eval_dataset,
            metrics=[MetricPromptTemplateExamples.Pointwise.SUMMARIZATION_QUALITY],
        ).evaluate(
            model=GenerativeModel("gemini-1.5-pro"),
            prompt_template="{instruction}. Article: {context}. Summary:",
        )
        ```

        4. To perform evaluation with custom model inference, specify the
        `model` parameter with a custom inference function. The input column
        name to the custom inference function is `prompt` and must be present
        in the dataset.

        ```
        from openai import OpenAI

        client = OpenAI()

        def custom_model_fn(input: str) -> str:
            response = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "user", "content": input}
                ]
            )
            return response.choices[0].message.content

        eval_dataset = pd.DataFrame({
            "prompt": [...],
            "reference": [...],
        })
        result = EvalTask(
            dataset=eval_dataset,
            metrics=[MetricPromptTemplateExamples.Pointwise.SAFETY],
            experiment="my-experiment",
        ).evaluate(
            model=custom_model_fn,
            experiment_run_name="gpt-eval-run",
        )
        ```

        5. To perform pairwise metric evaluation with a model inference step,
        specify the `baseline_model` input to a `PairwiseMetric` instance and
        the candidate `model` input to the `EvalTask.evaluate()` function. The
        input column name to both models is `prompt` and must be present in
        the dataset.

        ```
        baseline_model = GenerativeModel("gemini-1.0-pro")
        candidate_model = GenerativeModel("gemini-1.5-pro")

        pairwise_groundedness = PairwiseMetric(
            metric_prompt_template=MetricPromptTemplateExamples.get_prompt_template(
                "pairwise_groundedness"
            ),
            baseline_model=baseline_model,
        )
        eval_dataset = pd.DataFrame({
            "prompt": [...],
        })
        result = EvalTask(
            dataset=eval_dataset,
            metrics=[pairwise_groundedness],
            experiment="my-pairwise-experiment",
        ).evaluate(
            model=candidate_model,
            experiment_run_name="gemini-pairwise-eval-run",
        )
        ```
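
        6. The evaluation dataset can also be loaded from a file or table URI
        (see the supported formats in the `EvalTask` constructor), and the
        results table can be exported to Cloud Storage by setting
        `output_uri_prefix`. The URIs below are illustrative:

        ```
        eval_task = EvalTask(
            dataset="gs://my-bucket/eval_dataset.jsonl",
            metrics=["bleu", "rouge_l_sum"],
            experiment="my-experiment",
            output_uri_prefix="gs://my-bucket/eval_results",
        )
        eval_result = eval_task.evaluate(experiment_run_name="byor-uri-eval-run")
        ```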
"""
|
|
|
|
_resource_noun = "evaluationTasks"
|
|
|
|
def __init__(
|
|
self,
|
|
*,
|
|
dataset: Union["pd.DataFrame", str, Dict[str, Any]],
|
|
metrics: List[
|
|
Union[
|
|
Literal[
|
|
"exact_match",
|
|
"bleu",
|
|
"rouge_1",
|
|
"rouge_2",
|
|
"rouge_l",
|
|
"rouge_l_sum",
|
|
"tool_call_valid",
|
|
"tool_name_match",
|
|
"tool_parameter_key_match",
|
|
"tool_parameter_kv_match",
|
|
],
|
|
metrics_base.CustomMetric,
|
|
metrics_base._AutomaticMetric,
|
|
metrics_base._TranslationMetric,
|
|
pointwise_metric.PointwiseMetric,
|
|
pairwise_metric.PairwiseMetric,
|
|
]
|
|
],
|
|
experiment: Optional[str] = None,
|
|
metric_column_mapping: Optional[Dict[str, str]] = None,
|
|
output_uri_prefix: Optional[str] = "",
|
|
):
|
|
"""Initializes an EvalTask.
|
|
|
|
Args:
|
|
dataset: The dataset to be evaluated.
|
|
Supports the following dataset formats:
|
|
* pandas.DataFrame: Used directly for evaluation.
|
|
* Dict: Converted to a pandas DataFrame before evaluation.
|
|
* str: Interpreted as a file path or URI. Supported formats include:
|
|
* Local JSONL or CSV files: Loaded from the local filesystem.
|
|
* GCS JSONL or CSV files: Loaded from Google Cloud Storage
|
|
(e.g., 'gs://bucket/data.csv').
|
|
* BigQuery table URI: Loaded from Google Cloud BigQuery
|
|
(e.g., 'bq://project-id.dataset.table_name').
|
|
metrics: The list of metric names, or Metric instances to evaluate.
|
|
Prompt template is required for PairwiseMetric.
|
|
experiment: The name of the experiment to log the evaluations to.
|
|
metric_column_mapping: An optional dictionary column mapping that
|
|
overrides the metric prompt template input variable names with
|
|
mapped the evaluation dataset column names, used during evaluation.
|
|
For example, if the input_variables of the metric prompt template
|
|
are ["context", "reference"], the metric_column_mapping can be
|
|
{
|
|
"context": "news_context",
|
|
"reference": "ground_truth",
|
|
"response": "model_1_response"
|
|
}
|
|
if the dataset has columns "news_context", "ground_truth" and
|
|
"model_1_response".
|
|
output_uri_prefix: GCS location to store the metrics_table from
|
|
evaluation results.
|
|
"""
|
|
self._raw_dataset = dataset
|
|
self._dataset = eval_utils.load_dataset(dataset)
|
|
self._metrics = metrics
|
|
self._experiment = experiment
|
|
self._metric_column_mapping = eval_utils.initialize_metric_column_mapping(
|
|
metric_column_mapping, self._dataset
|
|
)
|
|
self.output_uri_prefix = output_uri_prefix
|
|
|
|
@property
|
|
def dataset(self) -> "pd.DataFrame":
|
|
"""Returns evaluation dataset."""
|
|
return self._dataset
|
|
|
|
@property
|
|
def metrics(self) -> List[Union[str, metrics_base.CustomMetric]]:
|
|
"""Returns metrics."""
|
|
return self._metrics
|
|
|
|
@property
|
|
def experiment(self) -> Optional[str]:
|
|
"""Returns experiment name."""
|
|
return self._experiment
|
|
|
|
def _evaluate_with_experiment(
|
|
self,
|
|
*,
|
|
model: Optional[Union[GenerativeModel, Callable[[str], str]]] = None,
|
|
prompt_template: Optional[str] = None,
|
|
experiment_run_name: Optional[str] = None,
|
|
evaluation_service_qps: Optional[float] = None,
|
|
retry_timeout: float = 120.0,
|
|
output_file_name: Optional[str] = None,
|
|
) -> EvalResult:
|
|
"""Runs an evaluation for the EvalTask with an experiment.
|
|
|
|
Args:
|
|
model: A GenerativeModel instance or a custom model function to generate
|
|
responses to evaluate. If not provided, the evaluation is computed with
|
|
the `response` column in the `dataset`.
|
|
prompt_template: The prompt template to use for the evaluation. If not
|
|
set, the prompt template that was used to create the EvalTask will be
|
|
used.
|
|
experiment_run_name: The name of the experiment run to log the evaluation
|
|
to if an experiment is set for this EvalTask. If not provided, a random
|
|
unique experiment run name is used.
|
|
evaluation_service_qps: The custom QPS limit for the evaluation service.
|
|
retry_timeout: How long to keep retrying the evaluation requests for
|
|
the whole evaluation dataset, in seconds.
|
|
output_file_name: The file name with csv suffix to store the output
|
|
metrics_table to be tracked in the experiment run.
|
|
|
|
Returns:
|
|
The evaluation result.
|
|
"""
|
|
self._validate_experiment_run()
|
|
with vertexai.preview.start_run(experiment_run_name):
|
|
self._log_eval_experiment_param(
|
|
model=model,
|
|
prompt_template=prompt_template,
|
|
output_file_name=output_file_name,
|
|
)
|
|
eval_result = _evaluation.evaluate(
|
|
dataset=self._dataset,
|
|
metrics=self._metrics,
|
|
model=model,
|
|
prompt_template=prompt_template,
|
|
metric_column_mapping=self._metric_column_mapping,
|
|
evaluation_service_qps=evaluation_service_qps,
|
|
retry_timeout=retry_timeout,
|
|
)
|
|
|
|
eval_result.summary_metrics = {
|
|
k: ("NaN" if isinstance(v, float) and np.isnan(v) else v)
|
|
for k, v in eval_result.summary_metrics.items()
|
|
}
|
|
eval_result.metadata = {
|
|
"experiment": self._experiment,
|
|
"experiment_run": experiment_run_name,
|
|
}
|
|
try:
|
|
vertexai.preview.log_metrics(eval_result.summary_metrics)
|
|
except (TypeError, exceptions.InvalidArgument) as e:
|
|
_LOGGER.warning(f"Experiment metrics logging failed: {str(e)}")
|
|
return eval_result
|
|
|
|
    def evaluate(
        self,
        *,
        model: Optional[Union[GenerativeModel, Callable[[str], str]]] = None,
        prompt_template: Optional[str] = None,
        experiment_run_name: Optional[str] = None,
        response_column_name: Optional[str] = None,
        baseline_model_response_column_name: Optional[str] = None,
        evaluation_service_qps: Optional[float] = None,
        retry_timeout: float = 120.0,
        output_file_name: Optional[str] = None,
    ) -> EvalResult:
        """Runs an evaluation for the EvalTask.

        Args:
          model: A GenerativeModel instance or a custom model function to
            generate responses to evaluate. If not provided, the evaluation can
            be performed in the bring-your-own-response (BYOR) mode.
          prompt_template: The prompt template to use for the evaluation. If
            not set, the prompt template that was used to create the EvalTask
            will be used.
          experiment_run_name: The name of the experiment run to log the
            evaluation to if an experiment is set for this EvalTask. If not
            provided, a random unique experiment run name is used.
          response_column_name: The column name of model response in the
            dataset. If provided, this will override the
            `metric_column_mapping` of the `EvalTask`.
          baseline_model_response_column_name: The column name of baseline
            model response in the dataset for pairwise metrics. If provided,
            this will override the `metric_column_mapping` of the `EvalTask`.
          evaluation_service_qps: The custom QPS limit for the evaluation
            service.
          retry_timeout: How long to keep retrying the evaluation requests for
            the whole evaluation dataset, in seconds.
          output_file_name: The file name with csv suffix to store the output
            metrics_table.

        Returns:
          The evaluation result.
        """
        global_experiment_name = metadata._experiment_tracker.experiment_name
        if experiment_run_name and not self._experiment and not global_experiment_name:
            raise ValueError(
                "Experiment is not set. Please initialize `EvalTask` with an"
                " experiment, or initialize a global experiment with"
                " `vertexai.init(experiment='experiment_name')` for logging"
                " this evaluation run."
            )
        if self.output_uri_prefix and not output_file_name:
            output_file_name = f"eval_results_{utils.timestamped_unique_name()}.csv"
        self._verify_and_set_response_column_name(
            response_column_name=response_column_name,
            metric_column_mapping_key=constants.Dataset.MODEL_RESPONSE_COLUMN,
        )
        self._verify_and_set_response_column_name(
            response_column_name=baseline_model_response_column_name,
            metric_column_mapping_key=constants.Dataset.BASELINE_MODEL_RESPONSE_COLUMN,
        )

        experiment_run_name = experiment_run_name or f"{uuid.uuid4()}"
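
        # If this EvalTask has its own experiment, the evaluation is logged
        # there, temporarily overriding any globally initialized experiment
        # (which is restored afterwards). Otherwise, a globally initialized
        # experiment is used; if neither is set, the evaluation runs without
        # experiment logging.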
        if self._experiment and global_experiment_name:
            metadata._experiment_tracker.set_experiment(
                experiment=self._experiment, backing_tensorboard=False
            )
            eval_result = self._evaluate_with_experiment(
                model=model,
                prompt_template=prompt_template,
                experiment_run_name=experiment_run_name,
                evaluation_service_qps=evaluation_service_qps,
                retry_timeout=retry_timeout,
                output_file_name=output_file_name,
            )
            metadata._experiment_tracker.set_experiment(
                experiment=global_experiment_name,
                backing_tensorboard=False,
                display_button=False,
            )
        elif self._experiment and not global_experiment_name:
            metadata._experiment_tracker.set_experiment(
                experiment=self._experiment, backing_tensorboard=False
            )
            eval_result = self._evaluate_with_experiment(
                model=model,
                prompt_template=prompt_template,
                experiment_run_name=experiment_run_name,
                evaluation_service_qps=evaluation_service_qps,
                retry_timeout=retry_timeout,
                output_file_name=output_file_name,
            )
            metadata._experiment_tracker.reset()
        elif not self._experiment and global_experiment_name:
            eval_result = self._evaluate_with_experiment(
                model=model,
                prompt_template=prompt_template,
                experiment_run_name=experiment_run_name,
                evaluation_service_qps=evaluation_service_qps,
                retry_timeout=retry_timeout,
                output_file_name=output_file_name,
            )
        else:
            eval_result = _evaluation.evaluate(
                dataset=self.dataset,
                metrics=self.metrics,
                model=model,
                prompt_template=prompt_template,
                metric_column_mapping=self._metric_column_mapping,
                evaluation_service_qps=evaluation_service_qps,
                retry_timeout=retry_timeout,
            )

        candidate_model_name = None
        if isinstance(model, generative_models.GenerativeModel):
            candidate_model_name = model._model_name

        baseline_model_name = None
        pairwise_metrics = [
            metric
            for metric in self.metrics
            if isinstance(metric, pairwise_metric.PairwiseMetric)
        ]
        if pairwise_metrics:
            # All pairwise metrics should have the same baseline model.
            baseline_model = pairwise_metrics[0].baseline_model
            if isinstance(baseline_model, generative_models.GenerativeModel):
                baseline_model_name = baseline_model._model_name

        dataset_uri = None
        if isinstance(self._raw_dataset, str):
            dataset_uri = self._raw_dataset

        eval_utils.upload_evaluation_results(
            eval_result,
            self.output_uri_prefix,
            output_file_name,
            candidate_model_name,
            baseline_model_name,
            dataset_uri,
            self.metrics,
        )
        return eval_result

    def _validate_experiment_run(self) -> None:
        """Checks if an experiment run already exists."""
        if metadata._experiment_tracker.experiment_run:
            raise ValueError(
                "Experiment run already exists. Please specify the name of the"
                " experiment run to assign the current session within this"
                " evaluation."
            )

    def _log_eval_experiment_param(
        self,
        model: Optional[Union[GenerativeModel, Callable[[str], str]]] = None,
        prompt_template: Optional[str] = None,
        output_file_name: Optional[str] = None,
    ) -> None:
        """Logs variable input parameters of an evaluation to an experiment run."""
        eval_metadata = {}

        if prompt_template is not None:
            eval_metadata.update({"prompt_template": prompt_template})

        if isinstance(model, GenerativeModel):
            eval_metadata.update(
                {
                    "model_name": model._model_name,
                }
            )

            if model._generation_config and isinstance(
                model._generation_config, dict
            ):
                eval_metadata.update(**model._generation_config)

            if model._safety_settings and isinstance(model._safety_settings, dict):
                safety_settings = model._safety_settings
                safety_settings_as_str = {
                    category.name: threshold.name
                    for category, threshold in safety_settings.items()
                }
                eval_metadata.update(safety_settings_as_str)

        if self.output_uri_prefix and output_file_name:
            eval_metadata.update(
                {"output_file": self.output_uri_prefix + "/" + output_file_name}
            )

        if eval_metadata:
            _LOGGER.info(f"Logging Eval Experiment metadata: {eval_metadata}")
            try:
                vertexai.preview.log_params(eval_metadata)
            except (ValueError, TypeError) as e:
                _LOGGER.warning(f"Experiment metadata logging failed: {str(e)}")

    def _verify_and_set_response_column_name(
        self, response_column_name: str, metric_column_mapping_key: str
    ) -> None:
        """Verifies and sets the model response column names."""
        if response_column_name:
            if response_column_name in self._dataset.columns:
                self._metric_column_mapping[
                    metric_column_mapping_key
                ] = response_column_name
            else:
                raise ValueError(
                    f"(Baseline) Model response column {response_column_name}"
                    " is not found in the dataset."
                )

    def display_runs(self):
        """Displays experiment runs associated with this EvalTask."""
        if not self._experiment:
            raise ValueError("Experiment is not set.")
        elif IPython_display:
            IPython_display.display(
                vertexai.preview.get_experiment_df(self._experiment)
            )