structure saas with tools
@@ -0,0 +1,980 @@
# -*- coding: utf-8 -*-

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Evaluation Orchestration Library."""

import collections
from concurrent import futures
import copy
import time
from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TYPE_CHECKING, Union

from google.cloud.aiplatform import base
from google.cloud.aiplatform_v1beta1.types import (
    content as gapic_content_types,
)
from vertexai import generative_models
from vertexai.evaluation import _base as evaluation_base
from vertexai.evaluation import constants
from vertexai.evaluation import (
    prompt_template as prompt_template_base,
)
from vertexai.evaluation import utils
from vertexai.evaluation.metrics import (
    _base as metrics_base,
)
from vertexai.evaluation.metrics import (
    _instance_evaluation,
)
from vertexai.evaluation.metrics import (
    metric_prompt_template_examples,
)
from vertexai.evaluation.metrics import pairwise_metric
from vertexai.evaluation.metrics import pointwise_metric


try:
    from tqdm import tqdm
except ImportError:
    raise ImportError(
        'tqdm is not installed. Please install the SDK using "pip install'
        ' google-cloud-aiplatform[evaluation]"'
    )

if TYPE_CHECKING:
    import pandas as pd

_LOGGER = base.Logger(__name__)
_SUCCESSFUL_FINISH_REASONS = [
    gapic_content_types.Candidate.FinishReason.STOP,
    gapic_content_types.Candidate.FinishReason.MAX_TOKENS,
    # Many responses have this finish reason
    gapic_content_types.Candidate.FinishReason.FINISH_REASON_UNSPECIFIED,
]


def _validate_metrics(metrics: List[Union[str, metrics_base._Metric]]) -> None:
    """Validates the metrics list.

    Args:
      metrics: The list of metric names, or Metric instances to
        evaluate.

    Raises:
      ValueError: If the metrics list is empty, or if multiple metrics with
        the same metric name are found.
    """
    if not metrics:
        raise ValueError("Metrics cannot be empty.")

    seen_strings = set()
    seen_metric_names = set()

    for metric in metrics:
        if isinstance(metric, str):
            if metric in seen_strings:
                raise ValueError(f"Duplicate string metric name found: '{metric}'")
            seen_strings.add(metric)
        elif isinstance(metric, metrics_base._Metric):
            if metric.metric_name in seen_metric_names:
                raise ValueError(
                    "Duplicate Metric instances of the same metric name found: "
                    f"'{metric.metric_name}'"
                )
            seen_metric_names.add(metric.metric_name)


def _validate_metric_column_map(
    evaluation_run_config: evaluation_base.EvaluationRunConfig,
):
    """Validates the column map for metric prompt template usage."""
    for metric in evaluation_run_config.metrics:
        if isinstance(
            metric, metrics_base._ModelBasedMetric  # pylint: disable=protected-access
        ):
            for variable in prompt_template_base.PromptTemplate(
                metric.metric_prompt_template
            ).variables:
                if (
                    evaluation_run_config.metric_column_mapping.get(variable, "")
                    not in evaluation_run_config.dataset.columns
                ):
                    raise ValueError(
                        f"Cannot find the `{variable}` column in the evaluation"
                        " dataset to fill the metric prompt template for"
                        f" `{str(metric)}` metric. Please check if the column is"
                        " present in the evaluation dataset, or provide a"
                        " key-value pair in `metric_column_mapping` parameter"
                        " of `EvalTask` to map it to a different column name."
                        " The evaluation dataset columns are"
                        f" {list(evaluation_run_config.dataset.columns)}."
                    )


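# NOTE: Illustrative sketch, not part of the library API. It shows the shape of
# a `metric_column_mapping` that satisfies the check above: every variable in a
# model-based metric's prompt template must resolve to an existing dataset
# column, either directly or through this mapping. The template variables and
# dataset column names below are hypothetical.
def _example_metric_column_mapping() -> Dict[str, str]:
    # A metric prompt template using `{context}` and `{response}` evaluated on
    # a dataset with columns "news_context" and "model_1_response".
    return {
        "context": "news_context",  # template variable -> dataset column
        "response": "model_1_response",
    }

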
def _validate_dataset(
    evaluation_run_config: evaluation_base.EvaluationRunConfig,
) -> None:
    """Validates that the required columns exist in the dataset."""
    _validate_response_column_required(evaluation_run_config)
    _validate_reference_column_required(evaluation_run_config)
    _validate_reference_or_source_column_required(evaluation_run_config)


def _validate_response_column_required(
    evaluation_run_config: evaluation_base.EvaluationRunConfig,
) -> None:
    """Validates that the response column exists in the dataset."""
    for metric in evaluation_run_config.metrics:
        if metric in constants.Metric.AUTOMATIC_METRIC_LIST or isinstance(
            metric, metrics_base._TranslationMetric  # pylint: disable=protected-access
        ):
            _validate_column_provided(
                evaluation_run_config,
                constants.Dataset.MODEL_RESPONSE_COLUMN,
            )


def _validate_reference_column_required(
    evaluation_run_config: evaluation_base.EvaluationRunConfig,
) -> None:
    """Validates that the reference column exists in the dataset."""
    if set(evaluation_run_config.metrics).intersection(
        set(constants.Metric.AUTOMATIC_METRIC_LIST)
    ):
        _validate_column_provided(
            evaluation_run_config,
            constants.Dataset.REFERENCE_COLUMN,
        )


def _validate_column_provided(
    evaluation_run_config: evaluation_base.EvaluationRunConfig,
    column_name: str,
) -> None:
    """Validates that the required column exists in the dataset."""
    if column_name not in evaluation_run_config.metric_column_mapping:
        evaluation_run_config.metric_column_mapping[column_name] = column_name
    evaluation_run_config.validate_dataset_column(column_name)


def _validate_reference_or_source_column_required(
    evaluation_run_config: evaluation_base.EvaluationRunConfig,
) -> None:
    """Validates that a reference or source column exists in the dataset."""
    for metric in evaluation_run_config.metrics:
        if isinstance(
            metric, metrics_base._TranslationMetric  # pylint: disable=protected-access
        ):
            # Validate the reference column.
            # This is optional if source column is provided.
            try:
                _validate_column_provided(
                    evaluation_run_config,
                    constants.Dataset.REFERENCE_COLUMN,
                )
            except KeyError:
                # Reference column is optional. Checking for source column.
                _validate_column_provided(
                    evaluation_run_config,
                    constants.Dataset.SOURCE_COLUMN,
                )


def _compute_custom_metrics(
    row_dict: Dict[str, Any],
    custom_metrics: List[metrics_base.CustomMetric],
    pbar: tqdm,
    executor: futures.ThreadPoolExecutor,
) -> Dict[str, Any]:
    """Computes custom metrics for a row.

    Args:
      row_dict: A dictionary of an instance in the eval dataset.
      custom_metrics: A list of CustomMetrics.
      pbar: A tqdm progress bar.
      executor: A thread pool executor.

    Returns:
      A dictionary of an instance containing custom metric results.

    Raises:
      KeyError: If the custom metric function does not return a valid output.
    """
    futures_by_metric = collections.defaultdict(list)
    for custom_metric in custom_metrics:
        future = executor.submit(custom_metric.metric_function, row_dict)
        future.add_done_callback(lambda _: pbar.update(1))
        futures_by_metric[custom_metric].append(future)

    for custom_metric, futures_list in futures_by_metric.items():
        for future in futures_list:
            metric_output = future.result()
            try:
                row_dict[
                    f"{custom_metric.name}/{constants.MetricResult.SCORE_KEY}"
                ] = metric_output[custom_metric.name]
            except KeyError:
                raise KeyError(
                    f"Custom metric score `{custom_metric.name}` not found in"
                    f" the metric output {metric_output}. Please make sure the"
                    " custom metric function is valid, and the output"
                    f" dictionary uses `{custom_metric.name}` as the key for"
                    " metric score."
                )
            # Include additional metric results like explanation.
            for key, value in metric_output.items():
                if key != custom_metric.name:
                    row_dict[f"{custom_metric.name}/{key}"] = value
    return row_dict


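# NOTE: Illustrative sketch, not part of the library API. It shows the output
# contract `_compute_custom_metrics` expects from a `CustomMetric` function: a
# dict keyed by the metric name for the score, plus optional extra keys (such
# as an explanation) that are copied into the row as `<metric name>/<key>`.
# The metric name and scoring rule below are hypothetical.
def _example_custom_metric_fn(instance: Dict[str, Any]) -> Dict[str, Any]:
    response = str(instance.get("response", ""))
    word_count = len(response.split())
    return {
        # Must match the `name` passed to CustomMetric(name=..., metric_function=...).
        "word_count": word_count,
        "explanation": f"The response contains {word_count} words.",
    }

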
def _separate_custom_metrics(
    metrics: List[Union[str, metrics_base._Metric]],
) -> Tuple[List[Union[str, metrics_base._Metric]], List[metrics_base.CustomMetric],]:
    """Separates the metrics list into API and custom metrics."""
    custom_metrics = []
    api_metrics = []
    for metric in metrics:
        if isinstance(metric, metrics_base.CustomMetric):
            custom_metrics.append(metric)
        else:
            api_metrics.append(metric)
    return api_metrics, custom_metrics


def _aggregate_summary_metrics(
    evaluation_run_config: evaluation_base.EvaluationRunConfig,
    metrics_table: "pd.DataFrame",
) -> Dict[str, Any]:
    """Computes summary metrics.

    Args:
      evaluation_run_config: Evaluation Run Configurations.
      metrics_table: A dataframe containing per-instance metrics results.

    Returns:
      A dictionary containing summary metrics results and statistics.
    """
    summary_metrics = {}
    summary_metrics[constants.MetricResult.ROW_COUNT_KEY] = metrics_table.shape[0]

    for metric in evaluation_run_config.metrics:
        try:
            if isinstance(metric, pairwise_metric.PairwiseMetric):
                summary_metrics[f"{metric.metric_name}/candidate_model_win_rate"] = (
                    metrics_table[
                        f"{metric.metric_name}/{constants.MetricResult.PAIRWISE_CHOICE_KEY}"
                    ]
                    == "CANDIDATE"
                ).mean()
                summary_metrics[f"{metric.metric_name}/baseline_model_win_rate"] = (
                    metrics_table[
                        f"{metric.metric_name}/{constants.MetricResult.PAIRWISE_CHOICE_KEY}"
                    ]
                    == "BASELINE"
                ).mean()
            else:
                summary_metrics[f"{str(metric)}/mean"] = metrics_table.loc[
                    :, f"{str(metric)}/{constants.MetricResult.SCORE_KEY}"
                ].mean()
                summary_metrics[f"{str(metric)}/std"] = metrics_table.loc[
                    :, f"{str(metric)}/{constants.MetricResult.SCORE_KEY}"
                ].std()
        except (ValueError, KeyError) as e:
            _LOGGER.warning(
                f"Failed to compute metric statistics for `{metric}` metric."
                f" {type(e).__name__}: {e}"
            )
            continue
    return summary_metrics


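# NOTE: Illustrative sketch, not part of the library API. It mirrors the
# aggregation above on a tiny in-memory metrics table to show the summary keys
# produced for a pointwise metric: `<metric>/mean` and `<metric>/std`, next to
# the row count. The metric name "fluency" and the scores are hypothetical.
def _example_summary_metrics() -> Dict[str, Any]:
    import pandas as pd

    toy_metrics_table = pd.DataFrame({"fluency/score": [4.0, 5.0, 3.0]})
    return {
        "row_count": toy_metrics_table.shape[0],
        "fluency/mean": toy_metrics_table["fluency/score"].mean(),
        "fluency/std": toy_metrics_table["fluency/score"].std(),
    }

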
def _generate_content_text_response(
    model: generative_models.GenerativeModel, prompt: str, max_retries: int = 3
) -> str:
    """Generates a text response from the Gemini model for a text prompt, with retries.

    Args:
      model: The Gemini model instance.
      prompt: The prompt to send to the model.
      max_retries: Maximum number of retries for response generation.

    Returns:
      The text response from the model.
      Returns constants.RESPONSE_ERROR if there is an error after all retries.
    """
    for retry_attempt in range(max_retries):
        try:
            response = model.generate_content(prompt)
            if not response.candidates:
                error_message = (
                    f"The model response was blocked due to"
                    f" {response._raw_response.prompt_feedback.block_reason.name}.\n"
                    f"Blocked reason message:"
                    f" {response._raw_response.prompt_feedback.block_reason_message}.\n"
                    "The input prompt may be blocked for safety reasons.\n"
                    f"Prompt: {prompt}.\n"
                    f"Retry attempt: {retry_attempt + 1}/{max_retries}"
                )
                _LOGGER.warning(error_message)
                break
            else:
                candidate = response.candidates[0]
                if candidate.finish_reason not in _SUCCESSFUL_FINISH_REASONS:
                    error_message = (
                        "The model response did not finish"
                        " successfully.\n"
                        f"Finish reason: {candidate.finish_reason}.\n"
                        f"Finish message: {candidate.finish_message}.\n"
                        f"Safety ratings: {candidate.safety_ratings}.\n"
                        "Please adjust the model safety_settings, or"
                        " try a different prompt.\n"
                        f"Retry attempt: {retry_attempt + 1}/{max_retries}"
                    )
                    _LOGGER.warning(error_message)
                else:
                    return response.candidates[0].content.parts[0].text
        except Exception as e:
            error_message = (
                f"Failed to generate response candidates from Gemini model"
                f" {model._model_name}.\n"
                f"Error: {e}.\n"
                f"Prompt: {prompt}.\n"
                f"Retry attempt: {retry_attempt + 1}/{max_retries}"
            )
            _LOGGER.warning(error_message)
        if retry_attempt < max_retries - 1:
            _LOGGER.info(
                f"Retrying response generation for prompt: {prompt}, attempt"
                f" {retry_attempt + 1}/{max_retries}..."
            )

    final_error_message = (
        f"Failed to generate response from Gemini model {model._model_name}.\n"
        f"Prompt: {prompt}."
    )
    _LOGGER.warning(final_error_message)
    return constants.RESPONSE_ERROR


def _generate_responses_from_gemini_model(
    model: generative_models.GenerativeModel,
    evaluation_run_config: evaluation_base.EvaluationRunConfig,
    is_baseline_model: bool = False,
) -> None:
    """Generates responses from the Gemini model.

    Args:
      model: The Gemini model instance.
      evaluation_run_config: Evaluation Run Configurations.
      is_baseline_model: Whether the model is a baseline model for PairwiseMetric.
    """
    # Ensure thread safety and avoid race conditions.
    df = evaluation_run_config.dataset.copy()

    _LOGGER.info(
        f"Generating a total of {evaluation_run_config.dataset.shape[0]} "
        f"responses from Gemini model {model._model_name.split('/')[-1]}."
    )
    tasks = []
    with tqdm(total=len(df)) as pbar:
        with futures.ThreadPoolExecutor(max_workers=constants.MAX_WORKERS) as executor:
            for _, row in df.iterrows():
                task = executor.submit(
                    _generate_content_text_response,
                    prompt=row[constants.Dataset.PROMPT_COLUMN],
                    model=model,
                )
                task.add_done_callback(lambda _: pbar.update(1))
                tasks.append(task)
    responses = [future.result() for future in tasks]
    if is_baseline_model:
        evaluation_run_config.dataset = df.assign(baseline_model_response=responses)
    else:
        evaluation_run_config.dataset = df.assign(response=responses)

    _LOGGER.info(
        f"All {evaluation_run_config.dataset.shape[0]} responses are successfully"
        f" generated from Gemini model {model._model_name.split('/')[-1]}."
    )


def _generate_response_from_custom_model_fn(
    model_fn: Callable[[str], str],
    evaluation_run_config: evaluation_base.EvaluationRunConfig,
    is_baseline_model: bool = False,
) -> None:
    """Generates responses from a custom model function.

    Args:
      model_fn: The custom model function.
      evaluation_run_config: Evaluation Run Configurations.
      is_baseline_model: Whether the model is a baseline model for
        PairwiseMetric.
    """
    eval_dataset = evaluation_run_config.dataset.copy()
    max_workers = 5

    _LOGGER.info(
        f"Generating a total of {evaluation_run_config.dataset.shape[0]} "
        "responses from the custom model function."
    )
    tasks = []
    try:
        with tqdm(total=len(eval_dataset)) as pbar:
            with futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
                for _, row in eval_dataset.iterrows():
                    task = executor.submit(
                        model_fn, row[constants.Dataset.PROMPT_COLUMN]
                    )
                    task.add_done_callback(lambda _: pbar.update(1))
                    tasks.append(task)
    except (ValueError, IndexError) as e:
        _LOGGER.warning(f"Failed to generate response from model function: {e}")

    responses = [task.result() for task in tasks]
    if is_baseline_model:
        evaluation_run_config.dataset = eval_dataset.assign(
            baseline_model_response=responses
        )
    else:
        evaluation_run_config.dataset = eval_dataset.assign(response=responses)

    _LOGGER.info(
        f"All {evaluation_run_config.dataset.shape[0]} responses are successfully"
        " generated from the custom model function."
    )


def _run_model_inference(
    model: Union[generative_models.GenerativeModel, Callable[[str], str]],
    evaluation_run_config: evaluation_base.EvaluationRunConfig,
    response_column_name: str = constants.Dataset.MODEL_RESPONSE_COLUMN,
) -> None:
    """Runs model inference on dataset for evaluation.

    Args:
      model: The model or baseline model or a custom model function to
        generate responses to evaluate.
      evaluation_run_config: Evaluation Run Configurations.
      response_column_name: Column name key in metric_column_mapping. Value is
        constants.Dataset.MODEL_RESPONSE_COLUMN or
        constants.Dataset.BASELINE_MODEL_RESPONSE_COLUMN.

    Raises:
      ValueError: If the model or baseline model is not supported.
    """
    is_baseline_model = (
        response_column_name == constants.Dataset.BASELINE_MODEL_RESPONSE_COLUMN
    )
    if model:
        if response_column_name not in evaluation_run_config.metric_column_mapping:
            if constants.Dataset.PROMPT_COLUMN in evaluation_run_config.dataset.columns:
                t1 = time.perf_counter()
                if isinstance(model, generative_models.GenerativeModel):
                    _generate_responses_from_gemini_model(
                        model, evaluation_run_config, is_baseline_model
                    )
                elif callable(model):
                    _generate_response_from_custom_model_fn(
                        model, evaluation_run_config, is_baseline_model
                    )
                else:
                    raise ValueError(
                        f"Unsupported model or baseline model type: {type(model)}"
                    )
                t2 = time.perf_counter()
                _LOGGER.info(f"Multithreaded Batch Inference took: {t2 - t1} seconds.")
                evaluation_run_config.metric_column_mapping[
                    response_column_name
                ] = response_column_name
            else:
                raise ValueError(
                    "Missing required input `prompt` column to start model inference."
                    " Please provide a `prompt_template` parameter in"
                    " `EvalTask.evaluate()` function if you want to assemble a"
                    " `prompt` column with variables from the dataset, or provide a"
                    " `prompt` column in dataset to directly use as input to"
                    " the model. Mappings in `metric_column_mapping` do not"
                    " apply for model inference and are used for evaluation only."
                )
        else:
            raise ValueError(
                "The `model` parameter or `baseline_model` in pairwise metric is"
                " specified, but the evaluation `dataset` contains model response"
                " column or baseline model response column"
                f" `{evaluation_run_config.metric_column_mapping[response_column_name]}`"
                " to perform bring-your-own-response(BYOR) evaluation. If you would"
                " like to perform evaluation using the dataset with the"
                " existing model response column or baseline model response column"
                f" `{evaluation_run_config.metric_column_mapping[response_column_name]}`,"
                " please remove `model` parameter in `EvalTask.evaluate()`"
                " function or `baseline_model` in `PairwiseMetric`."
            )


def _check_variable_columns_exist(
    dataset: "pd.DataFrame", variable_names_set: Set[str]
) -> None:
    """Checks if all variable names exist in the dataset columns.

    Args:
      dataset: The dataset to evaluate.
      variable_names_set: A set of variable names.

    Raises:
      ValueError: If any variable names do not exist in the dataset columns
        or the prompt template is invalid.
    """
    actual_column_names_set = set(dataset.columns)
    if not variable_names_set.issubset(actual_column_names_set):
        missing_columns = variable_names_set - actual_column_names_set
        raise ValueError(
            "Failed to assemble prompt template: The following column(s) are"
            f" missing: {', '.join(missing_columns)}. "
            f"Please verify prompt_template variables {variable_names_set} and "
            f"evaluation dataset column names {actual_column_names_set}."
        )


def _assemble_prompt_for_dataset(
    evaluation_run_config: evaluation_base.EvaluationRunConfig,
    prompt_template: Union[prompt_template_base.PromptTemplate, str],
) -> None:
    """Assembles a prompt column in the evaluation dataset from variable columns.

    Args:
      evaluation_run_config: Evaluation Run Configurations.
      prompt_template: A `PromptTemplate` object or a prompt template string
        with variables that can be assembled from the evaluation dataset. The
        variables can be represented in curly braces `{variable}`, and
        must be included in the dataset columns if specified. The variable
        names cannot contain spaces.

    Returns:
      None. The assembled `prompt` column is added to the evaluation dataset
      in place.

    Raises:
      ValueError: If any variable names do not exist in the dataset columns
        or the prompt template is invalid.
    """
    if not prompt_template:
        raise ValueError("Prompt template cannot be an empty string.")

    _LOGGER.info(
        "Assembling prompts from the `prompt_template`. The `prompt` column in"
        " the `EvalResult.metrics_table` has the assembled prompts used for model"
        " response generation."
    )
    if isinstance(prompt_template, str):
        prompt_template = prompt_template_base.PromptTemplate(prompt_template)
    _check_variable_columns_exist(
        evaluation_run_config.dataset, prompt_template.variables
    )

    try:
        evaluation_run_config.dataset[
            constants.Dataset.PROMPT_COLUMN
        ] = evaluation_run_config.dataset.apply(
            lambda row: str(
                prompt_template.assemble(
                    **row[list(prompt_template.variables)].astype(str).to_dict(),
                )
            ),
            axis=1,
        )
        if (
            constants.Dataset.PROMPT_COLUMN
            in evaluation_run_config.metric_column_mapping
            and evaluation_run_config.metric_column_mapping[
                constants.Dataset.PROMPT_COLUMN
            ]
            != constants.Dataset.PROMPT_COLUMN
        ):
            _LOGGER.warning(
                "The `prompt` column mapping provided in"
                " `metric_column_mapping` parameter is overwritten by the"
                " assembled `prompt` column because the `prompt_template`"
                " parameter is provided. Please verify that you want to use"
                " the assembled `prompt` column for evaluation."
            )
        evaluation_run_config.metric_column_mapping[
            constants.Dataset.PROMPT_COLUMN
        ] = constants.Dataset.PROMPT_COLUMN
    except Exception as e:
        raise ValueError(
            f"Failed to assemble prompt template: {e}. Please make sure all"
            " variables in `prompt_template` are present in the evaluation"
            f" dataset columns: `{list(evaluation_run_config.dataset.columns)}`."
        ) from e


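# NOTE: Illustrative sketch, not part of the library API. It shows how a prompt
# template with `{variable}` placeholders is assembled row by row from dataset
# columns, mirroring what `_assemble_prompt_for_dataset` does on the evaluation
# dataset. The template text and column names below are hypothetical.
def _example_prompt_assembly() -> List[str]:
    import pandas as pd

    template = prompt_template_base.PromptTemplate(
        "Answer the question using the context.\n"
        "Context: {context}\nQuestion: {question}"
    )
    df = pd.DataFrame(
        {"context": ["Paris is in France."], "question": ["Where is Paris?"]}
    )
    return df.apply(
        lambda row: str(template.assemble(**row.astype(str).to_dict())), axis=1
    ).tolist()

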
def _set_metric_table(
    metric_name: str,
    metric_results: Any,
    metrics_table: "pd.DataFrame",
    metric_result_key: str,
):
    """Parses values from metric results into the metrics_table."""
    if metric_result_key == constants.MetricResult.SCORE_KEY:
        metric_result_items = [
            result.get(metric_result_key) if isinstance(result, dict) else None
            for result in metric_results
        ]
    else:
        metric_result_items = [
            result.get(metric_result_key) if isinstance(result, dict) else "Error"
            for result in metric_results
        ]
    metrics_table[f"{metric_name}/{metric_result_key}"] = metric_result_items


def _parse_metric_results_to_dataframe(
    instance_df: "pd.DataFrame", results: Dict[Union[str, metrics_base._Metric], Any]
) -> "pd.DataFrame":
    """Parses metric results to a pandas dataframe.

    Args:
      instance_df: A dataframe containing per-instance metrics results.
      results: A dictionary containing metric results.

    Returns:
      A dataframe containing per-instance metrics results. Each metric result
      can contain metric score, explanation, and confidence.
    """
    try:
        import pandas as pd
    except ImportError:
        raise ImportError(
            'Pandas is not installed. Please install the SDK using "pip install'
            ' google-cloud-aiplatform[evaluation]"'
        )

    metrics_table = pd.DataFrame(dict(zip(instance_df.columns, instance_df.values.T)))
    for metric, metric_results in results.items():
        if isinstance(metric, pointwise_metric.PointwiseMetric):
            _set_metric_table(
                metric.metric_name,
                metric_results,
                metrics_table,
                constants.MetricResult.EXPLANATION_KEY,
            )
            _set_metric_table(
                metric.metric_name,
                metric_results,
                metrics_table,
                constants.MetricResult.SCORE_KEY,
            )
        elif isinstance(metric, pairwise_metric.PairwiseMetric):
            _set_metric_table(
                metric.metric_name,
                metric_results,
                metrics_table,
                constants.MetricResult.EXPLANATION_KEY,
            )
            _set_metric_table(
                metric.metric_name,
                metric_results,
                metrics_table,
                constants.MetricResult.PAIRWISE_CHOICE_KEY,
            )
        elif str(metric) in constants.Metric.AUTOMATIC_METRIC_LIST:
            _set_metric_table(
                str(metric),
                metric_results,
                metrics_table,
                constants.MetricResult.SCORE_KEY,
            )
        elif isinstance(
            metric, metrics_base._TranslationMetric  # pylint: disable=protected-access
        ):
            _set_metric_table(
                str(metric),
                metric_results,
                metrics_table,
                constants.MetricResult.SCORE_KEY,
            )
        else:
            _LOGGER.warning(
                f"Metric name: {str(metric)} is not supported when parsing"
                " metric results."
            )

    return metrics_table


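# NOTE: Illustrative sketch, not part of the library API. It lists the kind of
# column names the parser above adds to `metrics_table`: one
# `<metric name>/<result key>` column per parsed result, next to the original
# dataset columns. The metric names shown are hypothetical examples.
def _example_metrics_table_columns() -> List[str]:
    return [
        "prompt",                             # original dataset column
        "response",                           # original dataset column
        "fluency/score",                      # pointwise metric: score + explanation
        "fluency/explanation",
        "pairwise_quality/pairwise_choice",   # pairwise metric: choice + explanation
        "pairwise_quality/explanation",
        "exact_match/score",                  # automatic metric: score only
    ]

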
def _compute_metrics(
    evaluation_run_config: evaluation_base.EvaluationRunConfig,
) -> evaluation_base.EvalResult:
    """Computes the metrics for the dataset.

    Args:
      evaluation_run_config: Evaluation Run Configurations.

    Returns:
      The evaluation results for the input metrics.

    Raises:
      RuntimeError: The number of responses does not match the number of metrics.
    """
    try:
        import pandas as pd
    except ImportError:
        raise ImportError(
            'Pandas is not installed. Please install the SDK using "pip install'
            ' google-cloud-aiplatform[evaluation]"'
        )

    api_metrics, custom_metrics = _separate_custom_metrics(
        evaluation_run_config.metrics
    )
    row_count = len(evaluation_run_config.dataset)
    api_request_count = len(api_metrics) * row_count
    custom_metric_request_count = len(custom_metrics) * row_count
    total_request_count = api_request_count + custom_metric_request_count

    _LOGGER.info(
        f"Computing metrics with a total of {total_request_count} Vertex Gen AI"
        " Evaluation Service API requests."
    )

    instance_list = []
    futures_by_metric = collections.defaultdict(list)
    rate_limiter = utils.RateLimiter(evaluation_run_config.evaluation_service_qps)
    with tqdm(total=total_request_count) as pbar:
        with futures.ThreadPoolExecutor(max_workers=constants.MAX_WORKERS) as executor:
            for idx, row in evaluation_run_config.dataset.iterrows():
                row_dict = _compute_custom_metrics(
                    row.to_dict(), custom_metrics, pbar, executor
                )
                instance_list.append(row_dict)
                for metric in api_metrics:
                    future = executor.submit(
                        _instance_evaluation.evaluate_instances,
                        client=evaluation_run_config.client,
                        request=_instance_evaluation.build_request(
                            metric=metric,
                            row_dict=row_dict,
                            evaluation_run_config=evaluation_run_config,
                        ),
                        rate_limiter=rate_limiter,
                        retry_timeout=evaluation_run_config.retry_timeout,
                    )
                    future.add_done_callback(lambda _: pbar.update(1))
                    futures_by_metric[metric].append((future, idx))

            # Retrieve results from all futures and handle errors.
            results_dict = collections.defaultdict(list)
            error_list = []
            for metric, futures_list in futures_by_metric.items():
                for future, index in futures_list:
                    try:
                        response = future.result()
                        results_dict[metric].append(response)
                    except Exception as e:
                        results_dict[metric].append("Error")
                        error_list.append((metric, index, f"Error: {e}"))

    for metric, responses in results_dict.items():
        results_dict[metric] = [
            _instance_evaluation.handle_response(response) for response in responses
        ]
    if error_list:
        _LOGGER.warning(
            f"{len(error_list)} errors encountered during evaluation. Continue to"
            " compute summary metrics for the rest of the dataset."
        )
        for metric_name, index, error in error_list:
            _LOGGER.warning(
                f"Error encountered for metric {metric_name} at dataset index"
                f" {index}: {error}"
            )
    else:
        _LOGGER.info(
            f"All {total_request_count} metric requests are successfully computed."
        )

    instance_df = pd.DataFrame.from_dict(instance_list)
    metrics_table = _parse_metric_results_to_dataframe(instance_df, results_dict)

    # Aggregate the summary metrics.
    summary_metrics = _aggregate_summary_metrics(evaluation_run_config, metrics_table)

    return evaluation_base.EvalResult(
        summary_metrics=summary_metrics, metrics_table=metrics_table
    )


def _get_baseline_model(evaluation_run_config: evaluation_base.EvaluationRunConfig):
    """Gets the baseline model from the pairwise metrics."""
    pairwise_metric_instances = [
        metric
        for metric in evaluation_run_config.metrics
        if isinstance(metric, pairwise_metric.PairwiseMetric)
    ]
    baseline_models = {
        instance.metric_name: instance.baseline_model
        for instance in pairwise_metric_instances
    }
    if len(set(baseline_models.values())) > 1:
        raise ValueError(
            "Not all `PairwiseMetric` instances have the same `baseline_model`. "
            f"Here are the detected baseline models: `{baseline_models}`. "
            "Please separate pairwise metrics with different baseline models "
            "in different `EvalTask` or use the same baseline model for "
            "all pairwise metrics."
        )
    return pairwise_metric_instances[0].baseline_model


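# NOTE: Illustrative sketch, not part of the library API. All `PairwiseMetric`
# instances in a single evaluation run must share the same `baseline_model`,
# which is what `_get_baseline_model` enforces above. A configured Vertex AI
# environment is assumed, and the model name, metric names, and template text
# below are hypothetical.
def _example_pairwise_metrics_with_shared_baseline() -> List[
    pairwise_metric.PairwiseMetric
]:
    baseline = generative_models.GenerativeModel("gemini-1.0-pro")
    template = (
        "Compare the two responses to the prompt and choose the better one.\n"
        "Prompt: {prompt}\n"
        "Baseline response: {baseline_model_response}\n"
        "Candidate response: {response}"
    )
    return [
        pairwise_metric.PairwiseMetric(
            metric=name,
            metric_prompt_template=template,
            baseline_model=baseline,  # the same baseline for every pairwise metric
        )
        for name in ("pairwise_quality", "pairwise_helpfulness")
    ]

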
def _convert_metric_prompt_template_example(metrics):
    """Converts string metric names to generic model-based metric instances."""
    updated_metrics = []
    for metric in metrics:
        if metric in constants.Metric.POINTWISE_METRIC_PROMPT_TEMPLATE_EXAMPLE_LIST:
            template = metric_prompt_template_examples.MetricPromptTemplateExamples.get_prompt_template(
                metric
            )
            metric = pointwise_metric.PointwiseMetric(
                metric=metric, metric_prompt_template=template
            )
        elif metric in constants.Metric.PAIRWISE_METRIC_PROMPT_TEMPLATE_EXAMPLE_LIST:
            template = metric_prompt_template_examples.MetricPromptTemplateExamples.get_prompt_template(
                metric
            )
            metric = pairwise_metric.PairwiseMetric(
                metric=metric, metric_prompt_template=template
            )
            _LOGGER.info(
                f"Pairwise metric `{metric.metric_name}` loaded from"
                " `MetricPromptTemplateExamples` does not have `baseline_model`"
                " specified and only supports Bring-Your-Own-Response(BYOR)"
                " evaluation. If you would like to run inference on the baseline model,"
                " please instantiate a `PairwiseMetric` and provide the"
                " `baseline_model` parameter."
            )
        updated_metrics.append(metric)
    return updated_metrics


def evaluate(
    dataset: "pd.DataFrame",
    metrics: List[Union[str, metrics_base._Metric]],
    *,
    model: Optional[
        Union[generative_models.GenerativeModel, Callable[[str], str]]
    ] = None,
    prompt_template: Optional[Union[str, prompt_template_base.PromptTemplate]] = None,
    metric_column_mapping: Dict[str, str],
    evaluation_service_qps: Optional[float] = None,
    retry_timeout: float = 600.0,
) -> evaluation_base.EvalResult:
    """Runs the evaluation for metrics.

    Args:
      dataset: The dataset to evaluate.
      metrics: The list of metric names, or Metric instances to
        evaluate. Prompt template is required for PairwiseMetric.
      model: The GenerativeModel instance or a custom model function to generate
        responses to evaluate. If not provided, the evaluation is computed with
        the `response` column in the `dataset`.
      prompt_template: A `PromptTemplate` or a prompt template string compatible
        with `PromptTemplate` class with variables that can be formatted with
        dataset columns to create assembled prompts. The variables can be
        represented in curly braces `{variable_name}`, and must be included in the
        dataset columns if specified. The variable names cannot contain spaces.
      metric_column_mapping: An optional dictionary column mapping that
        overrides the metric prompt template input variable names with the
        mapped evaluation dataset column names, used during evaluation.
        For example, if the input_variables of the metric prompt template
        are ["context", "reference"], the metric_column_mapping can be
          {
              "context": "news_context",
              "reference": "ground_truth",
              "response": "model_1_response",
          }
        if the dataset has columns "news_context", "ground_truth" and
        "model_1_response".
      evaluation_service_qps: The custom QPS limit for the evaluation service.
      retry_timeout: How long to keep retrying the evaluation requests for the
        whole evaluation dataset, in seconds.

    Returns:
      EvalResult with summary metrics and a metrics table for per-instance
      metrics.

    Raises:
      ValueError: If the metrics list is empty, or the prompt template is not
        provided for PairwiseMetric, or multiple baseline models are specified for
        PairwiseMetric instances, or both model and dataset model response column
        are present.
    """
    _validate_metrics(metrics)
    metrics = _convert_metric_prompt_template_example(metrics)
    copied_metrics = []
    for metric in metrics:
        if isinstance(metric, pairwise_metric.PairwiseMetric):
            copied_metrics.append(
                pairwise_metric.PairwiseMetric(
                    metric=metric.metric_name,
                    metric_prompt_template=metric.metric_prompt_template,
                    baseline_model=metric.baseline_model,
                )
            )
        else:
            copied_metrics.append(copy.deepcopy(metric))
    evaluation_run_config = evaluation_base.EvaluationRunConfig(
        dataset=dataset.copy(deep=True),
        metrics=copied_metrics,
        metric_column_mapping=copy.deepcopy(metric_column_mapping),
        client=utils.create_evaluation_service_client(),
        evaluation_service_qps=(
            evaluation_service_qps
            if evaluation_service_qps
            else constants.QuotaLimit.EVAL_SERVICE_QPS
        ),
        retry_timeout=retry_timeout,
    )

    if prompt_template:
        _assemble_prompt_for_dataset(evaluation_run_config, prompt_template)

    _run_model_inference(
        model=model,
        evaluation_run_config=evaluation_run_config,
        response_column_name=constants.Dataset.MODEL_RESPONSE_COLUMN,
    )
    _validate_dataset(evaluation_run_config)

    pairwise_metric_exists = any(
        isinstance(metric, pairwise_metric.PairwiseMetric)
        for metric in evaluation_run_config.metrics
    )
    if pairwise_metric_exists:
        baseline_model = _get_baseline_model(evaluation_run_config)
        _run_model_inference(
            model=baseline_model,
            evaluation_run_config=evaluation_run_config,
            response_column_name=constants.Dataset.BASELINE_MODEL_RESPONSE_COLUMN,
        )

    _validate_metric_column_map(evaluation_run_config)
    t1 = time.perf_counter()
    evaluation_result = _compute_metrics(evaluation_run_config)
    t2 = time.perf_counter()
    _LOGGER.info(f"Evaluation took: {t2 - t1} seconds.")

    return evaluation_result


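# NOTE: Illustrative usage sketch, not part of this module. The private
# `evaluate` orchestration above is normally reached through the public
# `EvalTask` API. The dataset columns, metric names, and prompt template below
# are hypothetical, and an initialized Vertex AI project is assumed.
def _example_evaluate_usage():
    import pandas as pd
    from vertexai.evaluation import EvalTask

    eval_dataset = pd.DataFrame(
        {
            "context": ["Paris is the capital of France."],
            "question": ["What is the capital of France?"],
            "response": ["Paris."],  # bring-your-own-response evaluation
        }
    )
    eval_task = EvalTask(dataset=eval_dataset, metrics=["coherence", "fluency"])
    # No `model` is passed, so the existing `response` column is scored; the
    # `prompt_template` assembles a `prompt` column from the dataset columns.
    return eval_task.evaluate(
        prompt_template="Context: {context}\nQuestion: {question}"
    )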