# -*- coding: utf-8 -*-

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Evaluation Orchestration Library."""

import collections
from concurrent import futures
import copy
import time
from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TYPE_CHECKING, Union

from google.cloud.aiplatform import base
from google.cloud.aiplatform_v1beta1.types import (
    content as gapic_content_types,
)
from vertexai import generative_models
from vertexai.evaluation import _base as evaluation_base
from vertexai.evaluation import constants
from vertexai.evaluation import (
    prompt_template as prompt_template_base,
)
from vertexai.evaluation import utils
from vertexai.evaluation.metrics import (
    _base as metrics_base,
)
from vertexai.evaluation.metrics import (
    _instance_evaluation,
)
from vertexai.evaluation.metrics import (
    metric_prompt_template_examples,
)
from vertexai.evaluation.metrics import pairwise_metric
from vertexai.evaluation.metrics import pointwise_metric


try:
    from tqdm import tqdm
except ImportError:
    raise ImportError(
        'tqdm is not installed. Please install the SDK using "pip install'
        ' google-cloud-aiplatform[evaluation]"'
    )

if TYPE_CHECKING:
    import pandas as pd

_LOGGER = base.Logger(__name__)
_SUCCESSFUL_FINISH_REASONS = [
    gapic_content_types.Candidate.FinishReason.STOP,
    gapic_content_types.Candidate.FinishReason.MAX_TOKENS,
    # Many responses have this finish reason
    gapic_content_types.Candidate.FinishReason.FINISH_REASON_UNSPECIFIED,
]


def _validate_metrics(metrics: List[Union[str, metrics_base._Metric]]) -> None:
    """Validates the metrics list.

    Args:
        metrics: The list of metric names, or Metric instances to evaluate.

    Raises:
        ValueError: If metrics is empty or if multiple metrics of the same
            metric name are found.
    """
    if not metrics:
        raise ValueError("Metrics cannot be empty.")

    seen_strings = set()
    seen_metric_names = set()

    for metric in metrics:
        if isinstance(metric, str):
            if metric in seen_strings:
                raise ValueError(f"Duplicate string metric name found: '{metric}'")
            seen_strings.add(metric)
        elif isinstance(metric, metrics_base._Metric):
            if metric.metric_name in seen_metric_names:
                raise ValueError(
                    "Duplicate Metric instances of the same metric name found: "
                    f"'{metric.metric_name}'"
                )
            seen_metric_names.add(metric.metric_name)


def _validate_metric_column_map(
    evaluation_run_config: evaluation_base.EvaluationRunConfig,
):
    """Validates the column map for metric prompt template usage."""
    for metric in evaluation_run_config.metrics:
        if isinstance(
            metric, metrics_base._ModelBasedMetric  # pylint: disable=protected-access
        ):
            for variable in prompt_template_base.PromptTemplate(
                metric.metric_prompt_template
            ).variables:
                if (
                    evaluation_run_config.metric_column_mapping.get(variable, "")
                    not in evaluation_run_config.dataset.columns
                ):
                    raise ValueError(
                        f"Cannot find the `{variable}` column in the evaluation"
                        " dataset to fill the metric prompt template for"
                        f" `{str(metric)}` metric. Please check if the column is"
                        " present in the evaluation dataset, or provide a"
                        " key-value pair in `metric_column_mapping` parameter"
                        " of `EvalTask` to map it to a different column name."
                        " The evaluation dataset columns are"
                        f" {list(evaluation_run_config.dataset.columns)}."
                    )


def _validate_dataset(
    evaluation_run_config: evaluation_base.EvaluationRunConfig,
) -> None:
    """Validates that the required columns exist in the dataset."""
    _validate_response_column_required(evaluation_run_config)
    _validate_reference_column_required(evaluation_run_config)
    _validate_reference_or_source_column_required(evaluation_run_config)


def _validate_response_column_required(
    evaluation_run_config: evaluation_base.EvaluationRunConfig,
) -> None:
    """Validates the response column exists in the dataset."""
    for metric in evaluation_run_config.metrics:
        if metric in constants.Metric.AUTOMATIC_METRIC_LIST or isinstance(
            metric, metrics_base._TranslationMetric  # pylint: disable=protected-access
        ):
            _validate_column_provided(
                evaluation_run_config,
                constants.Dataset.MODEL_RESPONSE_COLUMN,
            )


def _validate_reference_column_required(
    evaluation_run_config: evaluation_base.EvaluationRunConfig,
) -> None:
    """Validates the reference column exists in the dataset."""
    if set(evaluation_run_config.metrics).intersection(
        set(constants.Metric.AUTOMATIC_METRIC_LIST)
    ):
        _validate_column_provided(
            evaluation_run_config,
            constants.Dataset.REFERENCE_COLUMN,
        )


def _validate_column_provided(
    evaluation_run_config: evaluation_base.EvaluationRunConfig,
    column_name: str,
) -> None:
    """Validates that the required column exists in the dataset."""
    if column_name not in evaluation_run_config.metric_column_mapping:
        evaluation_run_config.metric_column_mapping[column_name] = column_name
    evaluation_run_config.validate_dataset_column(column_name)


def _validate_reference_or_source_column_required(
    evaluation_run_config: evaluation_base.EvaluationRunConfig,
) -> None:
    """Validates that a reference or source column exists in the dataset."""
    for metric in evaluation_run_config.metrics:
        if isinstance(
            metric, metrics_base._TranslationMetric  # pylint: disable=protected-access
        ):
            # Validate the reference column.
            # This is optional if the source column is provided.
            try:
                _validate_column_provided(
                    evaluation_run_config,
                    constants.Dataset.REFERENCE_COLUMN,
                )
            except KeyError:
                # Reference column is optional. Checking for source column.
                _validate_column_provided(
                    evaluation_run_config,
                    constants.Dataset.SOURCE_COLUMN,
                )


def _compute_custom_metrics(
    row_dict: Dict[str, Any],
    custom_metrics: List[metrics_base.CustomMetric],
    pbar: tqdm,
    executor: futures.ThreadPoolExecutor,
) -> Dict[str, Any]:
    """Computes custom metrics for a row.

    Args:
        row_dict: A dictionary of an instance in the eval dataset.
        custom_metrics: A list of CustomMetrics.
        pbar: A tqdm progress bar.
        executor: A thread pool executor.

    Returns:
        A dictionary of an instance containing custom metric results.

    Raises:
        KeyError: If the custom metric function does not return a valid output.
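
    Example:
        An illustrative sketch of the expected contract (the metric name
        "fluency", the function below, and the literal "score"/"explanation"
        key suffixes are hypothetical; the actual suffixes come from
        `constants.MetricResult`)::

            def fluency_fn(instance: Dict[str, Any]) -> Dict[str, Any]:
                # The returned dict must use the metric name as the score key.
                return {"fluency": 0.9, "explanation": "Reads naturally."}

            custom_metric = metrics_base.CustomMetric(
                name="fluency", metric_function=fluency_fn
            )
            # After this helper runs, the row gains entries such as
            # row_dict["fluency/score"] and row_dict["fluency/explanation"].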
    """
    futures_by_metric = collections.defaultdict(list)
    for custom_metric in custom_metrics:
        future = executor.submit(custom_metric.metric_function, row_dict)
        future.add_done_callback(lambda _: pbar.update(1))
        futures_by_metric[custom_metric].append(future)

    for custom_metric, futures_list in futures_by_metric.items():
        for future in futures_list:
            metric_output = future.result()
            try:
                row_dict[
                    f"{custom_metric.name}/{constants.MetricResult.SCORE_KEY}"
                ] = metric_output[custom_metric.name]
            except KeyError:
                raise KeyError(
                    f"Custom metric score `{custom_metric.name}` not found in"
                    f" the metric output {metric_output}. Please make sure the"
                    " custom metric function is valid, and the output"
                    f" dictionary uses `{custom_metric.name}` as the key for"
                    " metric score."
                )
            # Include additional metric results like explanation.
            for key, value in metric_output.items():
                if key != custom_metric.name:
                    row_dict[f"{custom_metric.name}/{key}"] = value
    return row_dict


def _separate_custom_metrics(
    metrics: List[Union[str, metrics_base._Metric]],
) -> Tuple[List[Union[str, metrics_base._Metric]], List[metrics_base.CustomMetric]]:
    """Separates the metrics list into API and custom metrics."""
    custom_metrics = []
    api_metrics = []
    for metric in metrics:
        if isinstance(metric, metrics_base.CustomMetric):
            custom_metrics.append(metric)
        else:
            api_metrics.append(metric)
    return api_metrics, custom_metrics


def _aggregate_summary_metrics(
    evaluation_run_config: evaluation_base.EvaluationRunConfig,
    metrics_table: "pd.DataFrame",
) -> Dict[str, Any]:
    """Computes summary metrics.

    Args:
        evaluation_run_config: Evaluation Run Configurations.
        metrics_table: A dataframe containing per-instance metrics results.

    Returns:
        A dictionary containing summary metrics results and statistics.
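
        As an illustrative sketch (metric names and values are hypothetical;
        the exact key strings come from `constants.MetricResult`), the result
        may look like::

            {
                "row_count": 2,
                "bleu/mean": 0.45,
                "bleu/std": 0.12,
                "pairwise_coherence/candidate_model_win_rate": 0.5,
                "pairwise_coherence/baseline_model_win_rate": 0.5,
            }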
    """
    summary_metrics = {}
    summary_metrics[constants.MetricResult.ROW_COUNT_KEY] = metrics_table.shape[0]

    for metric in evaluation_run_config.metrics:
        try:
            if isinstance(metric, pairwise_metric.PairwiseMetric):
                summary_metrics[f"{metric.metric_name}/candidate_model_win_rate"] = (
                    metrics_table[
                        f"{metric.metric_name}/{constants.MetricResult.PAIRWISE_CHOICE_KEY}"
                    ]
                    == "CANDIDATE"
                ).mean()
                summary_metrics[f"{metric.metric_name}/baseline_model_win_rate"] = (
                    metrics_table[
                        f"{metric.metric_name}/{constants.MetricResult.PAIRWISE_CHOICE_KEY}"
                    ]
                    == "BASELINE"
                ).mean()
            else:
                summary_metrics[f"{str(metric)}/mean"] = metrics_table.loc[
                    :, f"{str(metric)}/{constants.MetricResult.SCORE_KEY}"
                ].mean()
                summary_metrics[f"{str(metric)}/std"] = metrics_table.loc[
                    :, f"{str(metric)}/{constants.MetricResult.SCORE_KEY}"
                ].std()
        except (ValueError, KeyError) as e:
            _LOGGER.warning(
                f"Failed to compute metric statistics for `{metric}` metric."
                f" {type(e).__name__}: {e}"
            )
            continue
    return summary_metrics


def _generate_content_text_response(
    model: generative_models.GenerativeModel, prompt: str, max_retries: int = 3
) -> str:
    """Generates a text response from a Gemini model for a text prompt, with retries.

    Args:
        model: The Gemini model instance.
        prompt: The prompt to send to the model.
        max_retries: Maximum number of retries for response generation.

    Returns:
        The text response from the model, or constants.RESPONSE_ERROR if there
        is an error after all retries.
    """
    for retry_attempt in range(max_retries):
        try:
            response = model.generate_content(prompt)
            if not response.candidates:
                error_message = (
                    f"The model response was blocked due to"
                    f" {response._raw_response.prompt_feedback.block_reason.name}.\n"
                    f"Blocked reason message:"
                    f" {response._raw_response.prompt_feedback.block_reason_message}.\n"
                    "The input prompt may be blocked for safety reasons.\n"
                    f"Prompt: {prompt}.\n"
                    f"Retry attempt: {retry_attempt + 1}/{max_retries}"
                )
                _LOGGER.warning(error_message)
                break
            else:
                candidate = response.candidates[0]
                if candidate.finish_reason not in _SUCCESSFUL_FINISH_REASONS:
                    error_message = (
                        "The model response did not finish"
                        " successfully.\n"
                        f"Finish reason: {candidate.finish_reason}.\n"
                        f"Finish message: {candidate.finish_message}.\n"
                        f"Safety ratings: {candidate.safety_ratings}.\n"
                        "Please adjust the model safety_settings, or"
                        " try a different prompt.\n"
                        f"Retry attempt: {retry_attempt + 1}/{max_retries}"
                    )
                    _LOGGER.warning(error_message)
                else:
                    return response.candidates[0].content.parts[0].text
        except Exception as e:
            error_message = (
                f"Failed to generate response candidates from Gemini model"
                f" {model._model_name}.\n"
                f"Error: {e}.\n"
                f"Prompt: {prompt}.\n"
                f"Retry attempt: {retry_attempt + 1}/{max_retries}"
            )
            _LOGGER.warning(error_message)
        if retry_attempt < max_retries - 1:
            _LOGGER.info(
                f"Retrying response generation for prompt: {prompt}, attempt"
                f" {retry_attempt + 1}/{max_retries}..."
            )

    final_error_message = (
        f"Failed to generate response from Gemini model {model._model_name}.\n"
        f"Prompt: {prompt}."
    )
    _LOGGER.warning(final_error_message)
    return constants.RESPONSE_ERROR


def _generate_responses_from_gemini_model(
    model: generative_models.GenerativeModel,
    evaluation_run_config: evaluation_base.EvaluationRunConfig,
    is_baseline_model: bool = False,
) -> None:
    """Generates responses from Gemini model.

    Args:
        model: The Gemini model instance.
        evaluation_run_config: Evaluation Run Configurations.
        is_baseline_model: Whether the model is a baseline model for PairwiseMetric.
    """
    # Ensure thread safety and avoid race conditions.
    df = evaluation_run_config.dataset.copy()

    _LOGGER.info(
        f"Generating a total of {evaluation_run_config.dataset.shape[0]} "
        f"responses from Gemini model {model._model_name.split('/')[-1]}."
    )
    tasks = []
    with tqdm(total=len(df)) as pbar:
        with futures.ThreadPoolExecutor(max_workers=constants.MAX_WORKERS) as executor:
            for _, row in df.iterrows():
                task = executor.submit(
                    _generate_content_text_response,
                    prompt=row[constants.Dataset.PROMPT_COLUMN],
                    model=model,
                )
                task.add_done_callback(lambda _: pbar.update(1))
                tasks.append(task)
    responses = [future.result() for future in tasks]
    if is_baseline_model:
        evaluation_run_config.dataset = df.assign(baseline_model_response=responses)
    else:
        evaluation_run_config.dataset = df.assign(response=responses)

    _LOGGER.info(
        f"All {evaluation_run_config.dataset.shape[0]} responses are successfully"
        f" generated from Gemini model {model._model_name.split('/')[-1]}."
    )


def _generate_response_from_custom_model_fn(
    model_fn: Callable[[str], str],
    evaluation_run_config: evaluation_base.EvaluationRunConfig,
    is_baseline_model: bool = False,
) -> None:
    """Generates responses from a custom model function.

    Args:
        model_fn: The custom model function.
        evaluation_run_config: Evaluation Run Configurations.
        is_baseline_model: Whether the model is a baseline model for
            PairwiseMetric.
    """
    eval_dataset = evaluation_run_config.dataset.copy()
    max_workers = 5

    _LOGGER.info(
        f"Generating a total of {evaluation_run_config.dataset.shape[0]} "
        "responses from the custom model function."
    )
    tasks = []
    try:
        with tqdm(total=len(eval_dataset)) as pbar:
            with futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
                for _, row in eval_dataset.iterrows():
                    task = executor.submit(
                        model_fn, row[constants.Dataset.PROMPT_COLUMN]
                    )
                    task.add_done_callback(lambda _: pbar.update(1))
                    tasks.append(task)
    except (ValueError, IndexError) as e:
        _LOGGER.warning(f"Failed to generate response from model function: {e}")

    responses = [task.result() for task in tasks]
    if is_baseline_model:
        evaluation_run_config.dataset = eval_dataset.assign(
            baseline_model_response=responses
        )
    else:
        evaluation_run_config.dataset = eval_dataset.assign(response=responses)

    _LOGGER.info(
        f"All {evaluation_run_config.dataset.shape[0]} responses are successfully"
        " generated from the custom model function."
    )


def _run_model_inference(
    model: Union[generative_models.GenerativeModel, Callable[[str], str]],
    evaluation_run_config: evaluation_base.EvaluationRunConfig,
    response_column_name: str = constants.Dataset.MODEL_RESPONSE_COLUMN,
) -> None:
    """Runs model inference on the dataset for evaluation.

    Args:
        model: The model or baseline model or a custom model function to
            generate responses to evaluate.
        evaluation_run_config: Evaluation Run Configurations.
        response_column_name: Column name key in metric_column_mapping. Value is
            constants.Dataset.MODEL_RESPONSE_COLUMN or
            constants.Dataset.BASELINE_MODEL_RESPONSE_COLUMN.

    Raises:
        ValueError: If the model or baseline model is not supported.
    """
    is_baseline_model = (
        response_column_name == constants.Dataset.BASELINE_MODEL_RESPONSE_COLUMN
    )
    if model:
        if response_column_name not in evaluation_run_config.metric_column_mapping:
            if constants.Dataset.PROMPT_COLUMN in evaluation_run_config.dataset.columns:
                t1 = time.perf_counter()
                if isinstance(model, generative_models.GenerativeModel):
                    _generate_responses_from_gemini_model(
                        model, evaluation_run_config, is_baseline_model
                    )
                elif callable(model):
                    _generate_response_from_custom_model_fn(
                        model, evaluation_run_config, is_baseline_model
                    )
                else:
                    raise ValueError(
                        f"Unsupported model or baseline model type: {type(model)}"
                    )
                t2 = time.perf_counter()
                _LOGGER.info(f"Multithreaded Batch Inference took: {t2 - t1} seconds.")
                evaluation_run_config.metric_column_mapping[
                    response_column_name
                ] = response_column_name
            else:
                raise ValueError(
                    "Missing required input `prompt` column to start model inference."
                    " Please provide a `prompt_template` parameter in"
                    " `EvalTask.evaluate()` function if you want to assemble a"
                    " `prompt` column with variables from the dataset, or provide a"
                    " `prompt` column in dataset to directly use as input to"
                    " the model. Mappings in `metric_column_mapping` do not"
                    " apply for model inference and are used for evaluation only."
                )
        else:
            raise ValueError(
                "The `model` parameter or `baseline_model` in pairwise metric is"
                " specified, but the evaluation `dataset` contains model response"
                " column or baseline model response column"
                f" `{evaluation_run_config.metric_column_mapping[response_column_name]}`"
                " to perform bring-your-own-response(BYOR) evaluation. If you would"
                " like to perform evaluation using the dataset with the"
                " existing model response column or baseline model response column"
                f" `{evaluation_run_config.metric_column_mapping[response_column_name]}`,"
                " please remove the `model` parameter in `EvalTask.evaluate()`"
                " function or `baseline_model` in `PairwiseMetric`."
            )


def _check_variable_columns_exist(
    dataset: "pd.DataFrame", variable_names_set: Set[str]
) -> None:
    """Checks if all variable names exist in the dataset columns.

    Args:
        dataset: The dataset to evaluate.
        variable_names_set: A set of variable names.

    Raises:
        ValueError: If any variable names do not exist in the dataset columns
            or the prompt template is invalid.
    """
    actual_column_names_set = set(dataset.columns)
    if not variable_names_set.issubset(actual_column_names_set):
        missing_columns = variable_names_set - actual_column_names_set
        raise ValueError(
            "Failed to assemble prompt template: The following column(s) are"
            f" missing: {', '.join(missing_columns)}. "
            f"Please verify prompt_template variables {variable_names_set} and "
            f"evaluation dataset column names {actual_column_names_set}."
        )


def _assemble_prompt_for_dataset(
    evaluation_run_config: evaluation_base.EvaluationRunConfig,
    prompt_template: Union[prompt_template_base.PromptTemplate, str],
) -> None:
    """Assembles a prompt column in the evaluation dataset from variable columns.

    Args:
        evaluation_run_config: Evaluation Run Configurations.
        prompt_template: A `PromptTemplate` object or a prompt template string
            with variables that can be assembled from the evaluation dataset.
            The variables can be represented in curly braces `{variable}`, and
            must be included in the dataset columns if specified. The variable
            names cannot contain spaces.

    Returns:
        None. The assembled prompts are written to the `prompt` column of the
        evaluation dataset in place.

    Raises:
        ValueError: If any variable names do not exist in the dataset columns
            or the prompt template is invalid.
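
    Example:
        A minimal sketch of how a template is filled (the template text and the
        `question`/`context` column names below are hypothetical)::

            # Evaluation dataset columns: "question", "context"
            prompt_template = "Answer {question} using only: {context}"
            # For a row with question="Who wrote it?" and context="A note.",
            # the assembled `prompt` column value becomes:
            #   "Answer Who wrote it? using only: A note."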
    """
    if not prompt_template:
        raise ValueError("Prompt template cannot be an empty string.")

    _LOGGER.info(
        "Assembling prompts from the `prompt_template`. The `prompt` column in"
        " the `EvalResult.metrics_table` has the assembled prompts used for model"
        " response generation."
    )
    if isinstance(prompt_template, str):
        prompt_template = prompt_template_base.PromptTemplate(prompt_template)
    _check_variable_columns_exist(
        evaluation_run_config.dataset, prompt_template.variables
    )

    try:
        evaluation_run_config.dataset[
            constants.Dataset.PROMPT_COLUMN
        ] = evaluation_run_config.dataset.apply(
            lambda row: str(
                prompt_template.assemble(
                    **row[list(prompt_template.variables)].astype(str).to_dict(),
                )
            ),
            axis=1,
        )
        if (
            constants.Dataset.PROMPT_COLUMN
            in evaluation_run_config.metric_column_mapping
            and evaluation_run_config.metric_column_mapping[
                constants.Dataset.PROMPT_COLUMN
            ]
            != constants.Dataset.PROMPT_COLUMN
        ):
            _LOGGER.warning(
                "The `prompt` column mapping provided in"
                " `metric_column_mapping` parameter is overwritten by the"
                " assembled `prompt` column because the `prompt_template`"
                " parameter is provided. Please verify that you want to use"
                " the assembled `prompt` column for evaluation."
            )
        evaluation_run_config.metric_column_mapping[
            constants.Dataset.PROMPT_COLUMN
        ] = constants.Dataset.PROMPT_COLUMN
    except Exception as e:
        raise ValueError(
            f"Failed to assemble prompt template: {e}. Please make sure all"
            " variables in `prompt_template` are present in the evaluation"
            f" dataset columns: `{list(evaluation_run_config.dataset.columns)}`."
        ) from e


def _set_metric_table(
    metric_name: str,
    metric_results: Any,
    metrics_table: "pd.DataFrame",
    metric_result_key: str,
):
    """Parses value from metric results to metrics_table."""
    if metric_result_key == constants.MetricResult.SCORE_KEY:
        metric_result_items = [
            result.get(metric_result_key) if isinstance(result, dict) else None
            for result in metric_results
        ]
    else:
        metric_result_items = [
            result.get(metric_result_key) if isinstance(result, dict) else "Error"
            for result in metric_results
        ]
    metrics_table[f"{metric_name}/{metric_result_key}"] = metric_result_items


def _parse_metric_results_to_dataframe(
    instance_df: "pd.DataFrame", results: Dict[Union[str, metrics_base._Metric], Any]
) -> "pd.DataFrame":
    """Parses metric results to a pandas dataframe.

    Args:
        instance_df: A dataframe containing per-instance metrics results.
        results: A dictionary containing metric results.

    Returns:
        A dataframe containing per-instance metrics results. Each metric result
        can contain metric score, explanation, and confidence.
    """
    try:
        import pandas as pd
    except ImportError:
        raise ImportError(
            'Pandas is not installed. Please install the SDK using "pip install'
            ' google-cloud-aiplatform[evaluation]"'
        )

    metrics_table = pd.DataFrame(dict(zip(instance_df.columns, instance_df.values.T)))
    for metric, metric_results in results.items():
        if isinstance(metric, pointwise_metric.PointwiseMetric):
            _set_metric_table(
                metric.metric_name,
                metric_results,
                metrics_table,
                constants.MetricResult.EXPLANATION_KEY,
            )
            _set_metric_table(
                metric.metric_name,
                metric_results,
                metrics_table,
                constants.MetricResult.SCORE_KEY,
            )
        elif isinstance(metric, pairwise_metric.PairwiseMetric):
            _set_metric_table(
                metric.metric_name,
                metric_results,
                metrics_table,
                constants.MetricResult.EXPLANATION_KEY,
            )
            _set_metric_table(
                metric.metric_name,
                metric_results,
                metrics_table,
                constants.MetricResult.PAIRWISE_CHOICE_KEY,
            )
        elif str(metric) in constants.Metric.AUTOMATIC_METRIC_LIST:
            _set_metric_table(
                str(metric),
                metric_results,
                metrics_table,
                constants.MetricResult.SCORE_KEY,
            )
        elif isinstance(
            metric, metrics_base._TranslationMetric  # pylint: disable=protected-access
        ):
            _set_metric_table(
                str(metric),
                metric_results,
                metrics_table,
                constants.MetricResult.SCORE_KEY,
            )
        else:
            _LOGGER.warning(
                f"Metric name: {str(metric)} is not supported when parsing"
                " metric results."
            )

    return metrics_table


def _compute_metrics(
    evaluation_run_config: evaluation_base.EvaluationRunConfig,
) -> evaluation_base.EvalResult:
    """Computes the metrics for the dataset.

    Args:
        evaluation_run_config: Evaluation Run Configurations.

    Returns:
        The evaluation results for the input metrics.

    Raises:
        RuntimeError: The number of responses does not match the number of metrics.
    """
    try:
        import pandas as pd
    except ImportError:
        raise ImportError(
            'Pandas is not installed. Please install the SDK using "pip install'
            ' google-cloud-aiplatform[evaluation]"'
        )

    api_metrics, custom_metrics = _separate_custom_metrics(
        evaluation_run_config.metrics
    )
    row_count = len(evaluation_run_config.dataset)
    api_request_count = len(api_metrics) * row_count
    custom_metric_request_count = len(custom_metrics) * row_count
    total_request_count = api_request_count + custom_metric_request_count

    _LOGGER.info(
        f"Computing metrics with a total of {total_request_count} Vertex Gen AI"
        " Evaluation Service API requests."
    )

    instance_list = []
    futures_by_metric = collections.defaultdict(list)
    rate_limiter = utils.RateLimiter(evaluation_run_config.evaluation_service_qps)
    with tqdm(total=total_request_count) as pbar:
        with futures.ThreadPoolExecutor(max_workers=constants.MAX_WORKERS) as executor:
            for idx, row in evaluation_run_config.dataset.iterrows():
                row_dict = _compute_custom_metrics(
                    row.to_dict(), custom_metrics, pbar, executor
                )
                instance_list.append(row_dict)
                for metric in api_metrics:
                    future = executor.submit(
                        _instance_evaluation.evaluate_instances,
                        client=evaluation_run_config.client,
                        request=_instance_evaluation.build_request(
                            metric=metric,
                            row_dict=row_dict,
                            evaluation_run_config=evaluation_run_config,
                        ),
                        rate_limiter=rate_limiter,
                        retry_timeout=evaluation_run_config.retry_timeout,
                    )
                    future.add_done_callback(lambda _: pbar.update(1))
                    futures_by_metric[metric].append((future, idx))

    # Retrieve results from all futures and handle errors.
    results_dict = collections.defaultdict(list)
    error_list = []
    for metric, futures_list in futures_by_metric.items():
        for future, index in futures_list:
            try:
                response = future.result()
                results_dict[metric].append(response)
            except Exception as e:
                results_dict[metric].append("Error")
                error_list.append((metric, index, f"Error: {e}"))

    for metric, responses in results_dict.items():
        results_dict[metric] = [
            _instance_evaluation.handle_response(response) for response in responses
        ]
    if error_list:
        _LOGGER.warning(
            f"{len(error_list)} errors encountered during evaluation. Continuing to"
            " compute summary metrics for the rest of the dataset."
        )
        for metric_name, index, error in error_list:
            _LOGGER.warning(
                f"Error encountered for metric {metric_name} at dataset index"
                f" {index}: {error}"
            )
    else:
        _LOGGER.info(
            f"All {total_request_count} metric requests are successfully computed."
        )

    instance_df = pd.DataFrame.from_dict(instance_list)
    metrics_table = _parse_metric_results_to_dataframe(instance_df, results_dict)

    # Aggregate the summary metrics.
    summary_metrics = _aggregate_summary_metrics(evaluation_run_config, metrics_table)

    return evaluation_base.EvalResult(
        summary_metrics=summary_metrics, metrics_table=metrics_table
    )


def _get_baseline_model(evaluation_run_config: evaluation_base.EvaluationRunConfig):
    """Gets the baseline model from the pairwise metrics."""
    pairwise_metric_instances = [
        metric
        for metric in evaluation_run_config.metrics
        if isinstance(metric, pairwise_metric.PairwiseMetric)
    ]
    baseline_models = {
        instance.metric_name: instance.baseline_model
        for instance in pairwise_metric_instances
    }
    if len(set(baseline_models.values())) > 1:
        raise ValueError(
            "Not all `PairwiseMetric` instances have the same `baseline_model`. "
            f"Here are the detected baseline models: `{baseline_models}`. "
            "Please separate pairwise metrics with different baseline models "
            "in different `EvalTask` or use the same baseline model for "
            "all pairwise metrics."
        )
    return pairwise_metric_instances[0].baseline_model


def _convert_metric_prompt_template_example(metrics):
    """Converts string metric names to generic model-based metric instances."""
    updated_metrics = []
    for metric in metrics:
        if metric in constants.Metric.POINTWISE_METRIC_PROMPT_TEMPLATE_EXAMPLE_LIST:
            template = metric_prompt_template_examples.MetricPromptTemplateExamples.get_prompt_template(
                metric
            )
            metric = pointwise_metric.PointwiseMetric(
                metric=metric, metric_prompt_template=template
            )
        elif metric in constants.Metric.PAIRWISE_METRIC_PROMPT_TEMPLATE_EXAMPLE_LIST:
            template = metric_prompt_template_examples.MetricPromptTemplateExamples.get_prompt_template(
                metric
            )
            metric = pairwise_metric.PairwiseMetric(
                metric=metric, metric_prompt_template=template
            )
            _LOGGER.info(
                f"Pairwise metric `{metric.metric_name}` loaded from"
                " `MetricPromptTemplateExamples` does not have `baseline_model`"
                " specified and only supports Bring-Your-Own-Response(BYOR)"
                " evaluation. If you would like to run inference on the baseline"
                " model, please instantiate a `PairwiseMetric` and provide the"
                " `baseline_model` parameter."
            )
        updated_metrics.append(metric)
    return updated_metrics


def evaluate(
    dataset: "pd.DataFrame",
    metrics: List[Union[str, metrics_base._Metric]],
    *,
    model: Optional[
        Union[generative_models.GenerativeModel, Callable[[str], str]]
    ] = None,
    prompt_template: Optional[Union[str, prompt_template_base.PromptTemplate]] = None,
    metric_column_mapping: Dict[str, str],
    evaluation_service_qps: Optional[float] = None,
    retry_timeout: float = 600.0,
) -> evaluation_base.EvalResult:
    """Runs the evaluation for metrics.

    Args:
        dataset: The dataset to evaluate.
        metrics: The list of metric names, or Metric instances to evaluate.
            Prompt template is required for PairwiseMetric.
        model: The GenerativeModel instance or a custom model function to
            generate responses to evaluate. If not provided, the evaluation is
            computed with the `response` column in the `dataset`.
        prompt_template: A `PromptTemplate` or a prompt template string compatible
            with `PromptTemplate` class with variables that can be formatted with
            dataset columns to create assembled prompts. The variables can be
            represented in curly braces `{variable_name}`, and must be included
            in the dataset columns if specified. The variable names cannot
            contain spaces.
        metric_column_mapping: An optional dictionary column mapping that
            overrides the metric prompt template input variable names with the
            mapped evaluation dataset column names, used during evaluation.
            For example, if the input_variables of the metric prompt template
            are ["context", "reference"], the metric_column_mapping can be
                {
                    "context": "news_context",
                    "reference": "ground_truth",
                    "response": "model_1_response"
                }
            if the dataset has columns "news_context", "ground_truth" and
            "model_1_response".
        evaluation_service_qps: The custom QPS limit for the evaluation service.
        retry_timeout: How long to keep retrying the evaluation requests for the
            whole evaluation dataset, in seconds.

    Returns:
        EvalResult with summary metrics and a metrics table for per-instance
        metrics.

    Raises:
        ValueError: If the metrics list is empty, or the prompt template is not
            provided for PairwiseMetric, or multiple baseline models are
            specified for PairwiseMetric instances, or both model and dataset
            model response column are present.
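
    Example:
        A minimal, illustrative call (the dataset values and the choice of the
        "exact_match" computation-based metric below are hypothetical, not
        requirements of this function)::

            import pandas as pd

            eval_dataset = pd.DataFrame(
                {
                    "prompt": ["What is the capital of France?"],
                    "response": ["Paris"],
                    "reference": ["Paris"],
                }
            )
            eval_result = evaluate(
                dataset=eval_dataset,
                metrics=["exact_match"],
                metric_column_mapping={},
            )
            print(eval_result.summary_metrics)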
    """
    _validate_metrics(metrics)
    metrics = _convert_metric_prompt_template_example(metrics)
    copied_metrics = []
    for metric in metrics:
        if isinstance(metric, pairwise_metric.PairwiseMetric):
            copied_metrics.append(
                pairwise_metric.PairwiseMetric(
                    metric=metric.metric_name,
                    metric_prompt_template=metric.metric_prompt_template,
                    baseline_model=metric.baseline_model,
                )
            )
        else:
            copied_metrics.append(copy.deepcopy(metric))
    evaluation_run_config = evaluation_base.EvaluationRunConfig(
        dataset=dataset.copy(deep=True),
        metrics=copied_metrics,
        metric_column_mapping=copy.deepcopy(metric_column_mapping),
        client=utils.create_evaluation_service_client(),
        evaluation_service_qps=(
            evaluation_service_qps
            if evaluation_service_qps
            else constants.QuotaLimit.EVAL_SERVICE_QPS
        ),
        retry_timeout=retry_timeout,
    )

    if prompt_template:
        _assemble_prompt_for_dataset(evaluation_run_config, prompt_template)

    _run_model_inference(
        model=model,
        evaluation_run_config=evaluation_run_config,
        response_column_name=constants.Dataset.MODEL_RESPONSE_COLUMN,
    )
    _validate_dataset(evaluation_run_config)

    pairwise_metric_exists = any(
        isinstance(metric, pairwise_metric.PairwiseMetric)
        for metric in evaluation_run_config.metrics
    )
    if pairwise_metric_exists:
        baseline_model = _get_baseline_model(evaluation_run_config)
        _run_model_inference(
            model=baseline_model,
            evaluation_run_config=evaluation_run_config,
            response_column_name=constants.Dataset.BASELINE_MODEL_RESPONSE_COLUMN,
        )

    _validate_metric_column_map(evaluation_run_config)
    t1 = time.perf_counter()
    evaluation_result = _compute_metrics(evaluation_run_config)
    t2 = time.perf_counter()
    _LOGGER.info(f"Evaluation took: {t2 - t1} seconds.")

    return evaluation_result