# -*- coding: utf-8 -*-
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
from typing import Any, Callable, Dict, List, Literal, Optional, TYPE_CHECKING, Union
import uuid
import warnings
from google.api_core import exceptions
import vertexai
from google.cloud.aiplatform import base
from google.cloud.aiplatform import utils
from google.cloud.aiplatform.metadata import metadata
from vertexai import generative_models
from vertexai.evaluation import _base as eval_base
from vertexai.evaluation import _evaluation
from vertexai.evaluation import constants
from vertexai.evaluation import utils as eval_utils
from vertexai.evaluation.metrics import (
_base as metrics_base,
)
from vertexai.evaluation.metrics import (
pairwise_metric,
)
from vertexai.evaluation.metrics import (
pointwise_metric,
)
import numpy as np
if TYPE_CHECKING:
import pandas as pd
# pylint: disable=g-import-not-at-top
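# IPython is optional; it is only used by EvalTask.display_runs().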
try:
from IPython import display as IPython_display
except ImportError:
IPython_display = None
_LOGGER = base.Logger(__name__)
logging.getLogger("urllib3.connectionpool").setLevel(logging.ERROR)
warnings.filterwarnings("ignore")
EvalResult = eval_base.EvalResult
GenerativeModel = generative_models.GenerativeModel
class EvalTask:
"""A class representing an EvalTask.
An evaluation task assesses the ability of a Gen AI model, agent, or
application to perform a specific task in response to prompts.
Each evaluation task includes an evaluation dataset, which can be a set of
test cases, and a set of metrics for assessment. These tasks provide the
framework for running evaluations in a standardized and repeatable way,
allowing for comparative assessment with varying run-specific parameters.
Dataset Details:
Default dataset column names:
* prompt_column_name: "prompt"
* reference_column_name: "reference"
* response_column_name: "response"
* baseline_model_response_column_name: "baseline_model_response"
* rubrics_column_name: "rubrics"
Requirements for different use cases:
* Bring-your-own-response (BYOR): You already have the data that you
want to evaluate stored in the dataset. The response column name can be
customized by providing the `response_column_name` parameter, or in the
`metric_column_mapping`. For BYOR pairwise evaluation, the baseline
model response column name can be customized by providing the
`baseline_model_response_column_name` parameter, or in the
`metric_column_mapping`. If the `response` column or the
`baseline_model_response` column is present while the
corresponding model is also specified, an error will be raised.
* Perform model/agent inference without a prompt template: You have a dataset
containing the input prompts to the model/agent and want to perform
inference before evaluation. A column named `prompt` is required
in the evaluation dataset and is used directly as input to the model/agent.
* Perform model/agent inference with a prompt template: You have a dataset
containing the input variables to the prompt template and want to
assemble the prompts for inference. Evaluation dataset
must contain column names corresponding to the variable names in
the prompt template. For example, if the prompt template is
"Instruction: {instruction}, context: {context}", the dataset must
contain `instruction` and `context` columns.
Metrics Details:
Descriptions of the supported metrics, their rating rubrics, and the
required input variables can be found on the Vertex AI public documentation page
[Evaluation methods and metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval).
Usage Examples:
1. To perform bring-your-own-response (BYOR) evaluation, provide the model
responses in the `response` column in the dataset. If a pairwise metric is
used for BYOR evaluation, provide the baseline model responses in the
`baseline_model_response` column.
```
eval_dataset = pd.DataFrame({
"prompt" : [...],
"reference": [...],
"response" : [...],
"baseline_model_response": [...],
})
eval_task = EvalTask(
dataset=eval_dataset,
metrics=[
"bleu",
"rouge_l_sum",
MetricPromptTemplateExamples.Pointwise.FLUENCY,
MetricPromptTemplateExamples.Pairwise.SAFETY
],
experiment="my-experiment",
)
eval_result = eval_task.evaluate(experiment_run_name="eval-experiment-run")
```
2. To perform evaluation with Gemini model inference, specify the `model`
parameter with a `GenerativeModel` instance. The input column name to the
model is `prompt` and must be present in the dataset.
```
eval_dataset = pd.DataFrame({
"reference": [...],
"prompt" : [...],
})
result = EvalTask(
dataset=eval_dataset,
metrics=["exact_match", "bleu", "rouge_1", "rouge_l_sum"],
experiment="my-experiment",
).evaluate(
model=GenerativeModel("gemini-1.5-pro"),
experiment_run_name="gemini-eval-run"
)
```
3. If a `prompt_template` is specified, the `prompt` column is not required.
Prompts can be assembled from the evaluation dataset, and all prompt
template variable names must be present in the dataset columns.
```
eval_dataset = pd.DataFrame({
"context" : [...],
"instruction": [...],
})
result = EvalTask(
dataset=eval_dataset,
metrics=[MetricPromptTemplateExamples.Pointwise.SUMMARIZATION_QUALITY],
).evaluate(
model=GenerativeModel("gemini-1.5-pro"),
prompt_template="{instruction}. Article: {context}. Summary:",
)
```
4. To perform evaluation with custom model inference, specify the `model`
parameter with a custom inference function. The input column name to the
custom inference function is `prompt` and must be present in the dataset.
```
from openai import OpenAI
client = OpenAI()
def custom_model_fn(input: str) -> str:
response = client.chat.completions.create(
model="gpt-3.5-turbo",
messages=[
{"role": "user", "content": input}
]
)
return response.choices[0].message.content
eval_dataset = pd.DataFrame({
"prompt" : [...],
"reference": [...],
})
result = EvalTask(
dataset=eval_dataset,
metrics=[MetricPromptTemplateExamples.Pointwise.SAFETY],
experiment="my-experiment",
).evaluate(
model=custom_model_fn,
experiment_run_name="gpt-eval-run"
)
```
5. To perform pairwise metric evaluation with a model inference step, specify
the `baseline_model` input to a `PairwiseMetric` instance and the candidate
`model` input to the `EvalTask.evaluate()` function. The input column name
to both models is `prompt` and must be present in the dataset.
```
baseline_model = GenerativeModel("gemini-1.0-pro")
candidate_model = GenerativeModel("gemini-1.5-pro")
pairwise_groundedness = PairwiseMetric(
metric_prompt_template=MetricPromptTemplateExamples.get_prompt_template(
"pairwise_groundedness"
),
baseline_model=baseline_model,
)
eval_dataset = pd.DataFrame({
"prompt" : [...],
})
result = EvalTask(
dataset=eval_dataset,
metrics=[pairwise_groundedness],
experiment="my-pairwise-experiment",
).evaluate(
model=candidate_model,
experiment_run_name="gemini-pairwise-eval-run",
)
```
"""
_resource_noun = "evaluationTasks"
def __init__(
self,
*,
dataset: Union["pd.DataFrame", str, Dict[str, Any]],
metrics: List[
Union[
Literal[
"exact_match",
"bleu",
"rouge_1",
"rouge_2",
"rouge_l",
"rouge_l_sum",
"tool_call_valid",
"tool_name_match",
"tool_parameter_key_match",
"tool_parameter_kv_match",
],
metrics_base.CustomMetric,
metrics_base._AutomaticMetric,
metrics_base._TranslationMetric,
pointwise_metric.PointwiseMetric,
pairwise_metric.PairwiseMetric,
]
],
experiment: Optional[str] = None,
metric_column_mapping: Optional[Dict[str, str]] = None,
output_uri_prefix: Optional[str] = "",
):
"""Initializes an EvalTask.
Args:
dataset: The dataset to be evaluated.
Supports the following dataset formats:
* pandas.DataFrame: Used directly for evaluation.
* Dict: Converted to a pandas DataFrame before evaluation.
* str: Interpreted as a file path or URI. Supported formats include:
* Local JSONL or CSV files: Loaded from the local filesystem.
* GCS JSONL or CSV files: Loaded from Google Cloud Storage
(e.g., 'gs://bucket/data.csv').
* BigQuery table URI: Loaded from Google Cloud BigQuery
(e.g., 'bq://project-id.dataset.table_name').
metrics: The list of metric names or Metric instances to evaluate.
A metric prompt template is required for PairwiseMetric.
experiment: The name of the experiment to log the evaluations to.
metric_column_mapping: An optional dictionary column mapping that
overrides the metric prompt template input variable names with the
mapped evaluation dataset column names, used during evaluation.
For example, if the input_variables of the metric prompt template
are ["context", "reference"], the metric_column_mapping can be
{
"context": "news_context",
"reference": "ground_truth",
"response": "model_1_response"
}
if the dataset has columns "news_context", "ground_truth" and
"model_1_response".
output_uri_prefix: GCS location to store the metrics_table from
evaluation results.
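Example:
A minimal sketch for illustration only; the dataset values, bucket URI,
and experiment name below are placeholders, not required values:
```
eval_task = EvalTask(
# A dict dataset is converted to a pandas DataFrame before evaluation.
dataset={
"prompt": [...],
"ground_truth": [...],
"model_1_response": [...],
},
metrics=["bleu", "rouge_l_sum"],
# Map metric input variable names to the dataset column names.
metric_column_mapping={
"reference": "ground_truth",
"response": "model_1_response",
},
experiment="my-experiment",
output_uri_prefix="gs://my-bucket/eval-results",
)
```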
"""
self._raw_dataset = dataset
self._dataset = eval_utils.load_dataset(dataset)
self._metrics = metrics
self._experiment = experiment
self._metric_column_mapping = eval_utils.initialize_metric_column_mapping(
metric_column_mapping, self._dataset
)
self.output_uri_prefix = output_uri_prefix
@property
def dataset(self) -> "pd.DataFrame":
"""Returns evaluation dataset."""
return self._dataset
@property
def metrics(self) -> List[Union[str, metrics_base.CustomMetric]]:
"""Returns metrics."""
return self._metrics
@property
def experiment(self) -> Optional[str]:
"""Returns experiment name."""
return self._experiment
def _evaluate_with_experiment(
self,
*,
model: Optional[Union[GenerativeModel, Callable[[str], str]]] = None,
prompt_template: Optional[str] = None,
experiment_run_name: Optional[str] = None,
evaluation_service_qps: Optional[float] = None,
retry_timeout: float = 120.0,
output_file_name: Optional[str] = None,
) -> EvalResult:
"""Runs an evaluation for the EvalTask with an experiment.
Args:
model: A GenerativeModel instance or a custom model function to generate
responses to evaluate. If not provided, the evaluation is computed with
the `response` column in the `dataset`.
prompt_template: The prompt template to use for the evaluation. If not
set, the `prompt` column in the dataset is used directly as the model
input.
experiment_run_name: The name of the experiment run to log the evaluation
to if an experiment is set for this EvalTask. If not provided, a random
unique experiment run name is used.
evaluation_service_qps: The custom QPS limit for the evaluation service.
retry_timeout: How long to keep retrying the evaluation requests for
the whole evaluation dataset, in seconds.
output_file_name: The CSV file name used to store the output
metrics_table, tracked in the experiment run.
Returns:
The evaluation result.
"""
self._validate_experiment_run()
with vertexai.preview.start_run(experiment_run_name):
self._log_eval_experiment_param(
model=model,
prompt_template=prompt_template,
output_file_name=output_file_name,
)
eval_result = _evaluation.evaluate(
dataset=self._dataset,
metrics=self._metrics,
model=model,
prompt_template=prompt_template,
metric_column_mapping=self._metric_column_mapping,
evaluation_service_qps=evaluation_service_qps,
retry_timeout=retry_timeout,
)
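# Replace NaN summary values with the string "NaN" before logging them
# to the experiment run.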
eval_result.summary_metrics = {
k: ("NaN" if isinstance(v, float) and np.isnan(v) else v)
for k, v in eval_result.summary_metrics.items()
}
eval_result.metadata = {
"experiment": self._experiment,
"experiment_run": experiment_run_name,
}
try:
vertexai.preview.log_metrics(eval_result.summary_metrics)
except (TypeError, exceptions.InvalidArgument) as e:
_LOGGER.warning(f"Experiment metrics logging failed: {str(e)}")
return eval_result
def evaluate(
self,
*,
model: Optional[Union[GenerativeModel, Callable[[str], str]]] = None,
prompt_template: Optional[str] = None,
experiment_run_name: Optional[str] = None,
response_column_name: Optional[str] = None,
baseline_model_response_column_name: Optional[str] = None,
evaluation_service_qps: Optional[float] = None,
retry_timeout: float = 120.0,
output_file_name: Optional[str] = None,
) -> EvalResult:
"""Runs an evaluation for the EvalTask.
Args:
model: A GenerativeModel instance or a custom model function to generate
responses to evaluate. If not provided, the evaluation is performed in
bring-your-own-response (BYOR) mode.
prompt_template: The prompt template to use for the evaluation. If not
set, the `prompt` column in the dataset is used directly as the model
input.
experiment_run_name: The name of the experiment run to log the evaluation
to if an experiment is set for this EvalTask. If not provided, a random
unique experiment run name is used.
response_column_name: The column name of the model response in the dataset.
If provided, this overrides the `metric_column_mapping` of the `EvalTask`.
baseline_model_response_column_name: The column name of the baseline model
response in the dataset for pairwise metrics. If provided, this overrides
the `metric_column_mapping` of the `EvalTask`.
evaluation_service_qps: The custom QPS limit for the evaluation service.
retry_timeout: How long to keep retrying the evaluation requests for
the whole evaluation dataset, in seconds.
output_file_name: The CSV file name used to store the output
metrics_table.
Returns:
The evaluation result.
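Example:
A minimal bring-your-own-response sketch; the column name
`model_1_response` and the run name below are placeholders:
```
eval_task = EvalTask(
dataset=pd.DataFrame({
"prompt": [...],
"reference": [...],
"model_1_response": [...],
}),
metrics=["exact_match", "bleu"],
experiment="my-experiment",
)
eval_result = eval_task.evaluate(
response_column_name="model_1_response",
experiment_run_name="byor-eval-run",
)
```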
"""
global_experiment_name = metadata._experiment_tracker.experiment_name
if experiment_run_name and not self._experiment and not global_experiment_name:
raise ValueError(
"Experiment is not set. Please initialize `EvalTask` with an"
" experiment, or initialize a global experiment with "
"`vertexai.init(experiment='experiment_name')` for logging this"
" evaluation run."
)
if self.output_uri_prefix and not output_file_name:
output_file_name = f"eval_results_{utils.timestamped_unique_name()}.csv"
self._verify_and_set_response_column_name(
response_column_name=response_column_name,
metric_column_mapping_key=constants.Dataset.MODEL_RESPONSE_COLUMN,
)
self._verify_and_set_response_column_name(
response_column_name=baseline_model_response_column_name,
metric_column_mapping_key=constants.Dataset.BASELINE_MODEL_RESPONSE_COLUMN,
)
experiment_run_name = experiment_run_name or f"{uuid.uuid4()}"
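# Decide where to log this evaluation run. Four cases are handled below:
# 1) Both an EvalTask experiment and a global experiment are set: switch to
#    the EvalTask experiment for this run, then restore the global experiment.
# 2) Only the EvalTask experiment is set: use it, then reset the tracker.
# 3) Only a global experiment is set: log the run to the global experiment.
# 4) Neither is set: run the evaluation without experiment logging.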
if self._experiment and global_experiment_name:
metadata._experiment_tracker.set_experiment(
experiment=self._experiment, backing_tensorboard=False
)
eval_result = self._evaluate_with_experiment(
model=model,
prompt_template=prompt_template,
experiment_run_name=experiment_run_name,
evaluation_service_qps=evaluation_service_qps,
retry_timeout=retry_timeout,
output_file_name=output_file_name,
)
metadata._experiment_tracker.set_experiment(
experiment=global_experiment_name,
backing_tensorboard=False,
display_button=False,
)
elif self._experiment and not global_experiment_name:
metadata._experiment_tracker.set_experiment(
experiment=self._experiment, backing_tensorboard=False
)
eval_result = self._evaluate_with_experiment(
model=model,
prompt_template=prompt_template,
experiment_run_name=experiment_run_name,
evaluation_service_qps=evaluation_service_qps,
retry_timeout=retry_timeout,
output_file_name=output_file_name,
)
metadata._experiment_tracker.reset()
elif not self._experiment and global_experiment_name:
eval_result = self._evaluate_with_experiment(
model=model,
prompt_template=prompt_template,
experiment_run_name=experiment_run_name,
evaluation_service_qps=evaluation_service_qps,
retry_timeout=retry_timeout,
output_file_name=output_file_name,
)
else:
eval_result = _evaluation.evaluate(
dataset=self.dataset,
metrics=self.metrics,
model=model,
prompt_template=prompt_template,
metric_column_mapping=self._metric_column_mapping,
evaluation_service_qps=evaluation_service_qps,
retry_timeout=retry_timeout,
)
candidate_model_name = None
if isinstance(model, generative_models.GenerativeModel):
candidate_model_name = model._model_name
baseline_model_name = None
pairwise_metrics = [
metric
for metric in self.metrics
if isinstance(metric, pairwise_metric.PairwiseMetric)
]
if pairwise_metrics:
# All pairwise metrics should have the same baseline model.
baseline_model = pairwise_metrics[0].baseline_model
if isinstance(baseline_model, generative_models.GenerativeModel):
baseline_model_name = baseline_model._model_name
dataset_uri = None
if isinstance(self._raw_dataset, str):
dataset_uri = self._raw_dataset
eval_utils.upload_evaluation_results(
eval_result,
self.output_uri_prefix,
output_file_name,
candidate_model_name,
baseline_model_name,
dataset_uri,
self.metrics,
)
return eval_result
def _validate_experiment_run(self) -> None:
"""Checks if an experiment run already exists."""
if metadata._experiment_tracker.experiment_run:
raise ValueError(
"Experiment run already exists. Please specify the name of the"
" experiment run to assign the current session to for this evaluation."
)
def _log_eval_experiment_param(
self,
model: Optional[Union[GenerativeModel, Callable[[str], str]]] = None,
prompt_template: Optional[str] = None,
output_file_name: Optional[str] = None,
) -> None:
"""Logs variable input parameters of an evaluation to an experiment run."""
eval_metadata = {}
if prompt_template is not None:
eval_metadata.update({"prompt_template": prompt_template})
if isinstance(model, GenerativeModel):
eval_metadata.update(
{
"model_name": model._model_name,
}
)
if model._generation_config and isinstance(model._generation_config, dict):
eval_metadata.update(**model._generation_config)
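# Convert safety-setting enums to their names so they can be logged as
# string parameters.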
if model._safety_settings and isinstance(model._safety_settings, dict):
safety_settings = model._safety_settings
safety_settings_as_str = {
category.name: threshold.name
for category, threshold in safety_settings.items()
}
eval_metadata.update(safety_settings_as_str)
if self.output_uri_prefix and output_file_name:
eval_metadata.update(
{"output_file": self.output_uri_prefix + "/" + output_file_name}
)
if eval_metadata:
_LOGGER.info(f"Logging Eval Experiment metadata: {eval_metadata}")
try:
vertexai.preview.log_params(eval_metadata)
except (ValueError, TypeError) as e:
_LOGGER.warning(f"Experiment metadata logging failed: {str(e)}")
def _verify_and_set_response_column_name(
self, response_column_name: str, metric_column_mapping_key: str
) -> None:
"""Verifies and sets the model response column names."""
if response_column_name:
if response_column_name in self._dataset.columns:
self._metric_column_mapping[
metric_column_mapping_key
] = response_column_name
else:
raise ValueError(
f"(Baseline) Model response column {response_column_name} is not"
" found in the dataset."
)
def display_runs(self):
"""Displays experiment runs associated with this EvalTask."""
if not self._experiment:
raise ValueError("Experiment is not set.")
elif IPython_display:
IPython_display.display(
vertexai.preview.get_experiment_df(self._experiment)
)