# -*- coding: utf-8 -*-
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
from typing import Any, Callable, Dict, List, Literal, Optional, TYPE_CHECKING, Union
import uuid
import warnings

from google.api_core import exceptions
import vertexai
from google.cloud.aiplatform import base
from google.cloud.aiplatform import utils
from google.cloud.aiplatform.metadata import metadata
from vertexai import generative_models
from vertexai.evaluation import _base as eval_base
from vertexai.evaluation import _evaluation
from vertexai.evaluation import constants
from vertexai.evaluation import utils as eval_utils
from vertexai.evaluation.metrics import (
    _base as metrics_base,
)
from vertexai.evaluation.metrics import (
    pairwise_metric,
)
from vertexai.evaluation.metrics import (
    pointwise_metric,
)
import numpy as np


if TYPE_CHECKING:
    import pandas as pd

# pylint: disable=g-import-not-at-top
try:
    from IPython import display as IPython_display
except ImportError:
    IPython_display = None

_LOGGER = base.Logger(__name__)
logging.getLogger("urllib3.connectionpool").setLevel(logging.ERROR)
warnings.filterwarnings("ignore")

EvalResult = eval_base.EvalResult
GenerativeModel = generative_models.GenerativeModel


class EvalTask:
    """A class representing an EvalTask.

    An evaluation task assesses the ability of a Gen AI model, agent, or
    application to perform a specific task in response to prompts. Each
    evaluation task includes an evaluation dataset (a set of test cases) and a
    set of metrics for assessment. These tasks provide the framework for
    running evaluations in a standardized and repeatable way, allowing for
    comparative assessment with varying run-specific parameters.

    Dataset Details:

        Default dataset column names:

            * prompt_column_name: "prompt"
            * reference_column_name: "reference"
            * response_column_name: "response"
            * baseline_model_response_column_name: "baseline_model_response"
            * rubrics_column_name: "rubrics"

        Requirement for different use cases:

            * Bring-your-own-response (BYOR): You already have the data that
              you want to evaluate stored in the dataset. The response column
              name can be customized by providing the `response_column_name`
              parameter, or in the `metric_column_mapping`. For BYOR pairwise
              evaluation, the baseline model response column name can be
              customized by providing the
              `baseline_model_response_column_name` parameter, or in the
              `metric_column_mapping`. If the `response` column or the
              `baseline_model_response` column is present while the
              corresponding model is specified, an error is raised.

            * Perform model/agent inference without a prompt template: You
              have a dataset containing the input prompts to the model/agent
              and want to perform inference before evaluation. A column named
              `prompt` is required in the evaluation dataset and is used
              directly as input to the model/agent.

            * Perform model/agent inference with a prompt template: You have a
              dataset containing the input variables to the prompt template
              and want to assemble the prompts for inference.
              The evaluation dataset must contain column names corresponding
              to the variable names in the prompt template. For example, if
              the prompt template is
              "Instruction: {instruction}, context: {context}", the dataset
              must contain `instruction` and `context` columns.

    Metrics Details:

        The supported metrics descriptions, rating rubrics, and the required
        input variables can be found on the Vertex AI public documentation
        page:
        [Evaluation methods and metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval).

    Usage Examples:

        1. To perform bring-your-own-response (BYOR) evaluation, provide the
        model responses in the `response` column in the dataset. If a pairwise
        metric is used for BYOR evaluation, provide the baseline model
        responses in the `baseline_model_response` column.

          ```
          eval_dataset = pd.DataFrame({
              "prompt": [...],
              "reference": [...],
              "response": [...],
              "baseline_model_response": [...],
          })
          eval_task = EvalTask(
              dataset=eval_dataset,
              metrics=[
                  "bleu",
                  "rouge_l_sum",
                  MetricPromptTemplateExamples.Pointwise.FLUENCY,
                  MetricPromptTemplateExamples.Pairwise.SAFETY,
              ],
              experiment="my-experiment",
          )
          eval_result = eval_task.evaluate(experiment_run_name="eval-experiment-run")
          ```

        2. To perform evaluation with Gemini model inference, specify the
        `model` parameter with a `GenerativeModel` instance. The input column
        name to the model is `prompt` and must be present in the dataset.

          ```
          eval_dataset = pd.DataFrame({
              "reference": [...],
              "prompt": [...],
          })
          result = EvalTask(
              dataset=eval_dataset,
              metrics=["exact_match", "bleu", "rouge_1", "rouge_l_sum"],
              experiment="my-experiment",
          ).evaluate(
              model=GenerativeModel("gemini-1.5-pro"),
              experiment_run_name="gemini-eval-run",
          )
          ```

        3. If a `prompt_template` is specified, the `prompt` column is not
        required. Prompts can be assembled from the evaluation dataset, and
        all prompt template variable names must be present in the dataset
        columns.

          ```
          eval_dataset = pd.DataFrame({
              "context": [...],
              "instruction": [...],
          })
          result = EvalTask(
              dataset=eval_dataset,
              metrics=[MetricPromptTemplateExamples.Pointwise.SUMMARIZATION_QUALITY],
          ).evaluate(
              model=GenerativeModel("gemini-1.5-pro"),
              prompt_template="{instruction}. Article: {context}. Summary:",
          )
          ```

        4. To perform evaluation with custom model inference, specify the
        `model` parameter with a custom inference function. The input column
        name to the custom inference function is `prompt` and must be present
        in the dataset.

          ```
          from openai import OpenAI

          client = OpenAI()

          def custom_model_fn(input: str) -> str:
              response = client.chat.completions.create(
                  model="gpt-3.5-turbo",
                  messages=[{"role": "user", "content": input}],
              )
              return response.choices[0].message.content

          eval_dataset = pd.DataFrame({
              "prompt": [...],
              "reference": [...],
          })
          result = EvalTask(
              dataset=eval_dataset,
              metrics=[MetricPromptTemplateExamples.Pointwise.SAFETY],
              experiment="my-experiment",
          ).evaluate(
              model=custom_model_fn,
              experiment_run_name="gpt-eval-run",
          )
          ```

        5. To perform pairwise metric evaluation with a model inference step,
        specify the `baseline_model` input to a `PairwiseMetric` instance and
        the candidate `model` input to the `EvalTask.evaluate()` function. The
        input column name to both models is `prompt` and must be present in
        the dataset.
          ```
          baseline_model = GenerativeModel("gemini-1.0-pro")
          candidate_model = GenerativeModel("gemini-1.5-pro")

          pairwise_groundedness = PairwiseMetric(
              metric_prompt_template=MetricPromptTemplateExamples.get_prompt_template(
                  "pairwise_groundedness"
              ),
              baseline_model=baseline_model,
          )
          eval_dataset = pd.DataFrame({
              "prompt": [...],
          })
          result = EvalTask(
              dataset=eval_dataset,
              metrics=[pairwise_groundedness],
              experiment="my-pairwise-experiment",
          ).evaluate(
              model=candidate_model,
              experiment_run_name="gemini-pairwise-eval-run",
          )
          ```
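
        6. To perform BYOR evaluation on responses stored under a custom
        column name, override the default column name with the
        `response_column_name` parameter of `EvalTask.evaluate()` (the column
        name `model_1_response` below is an illustrative placeholder, not a
        default).

          ```
          # "model_1_response" is an illustrative custom column name.
          eval_dataset = pd.DataFrame({
              "prompt": [...],
              "model_1_response": [...],
          })
          eval_result = EvalTask(
              dataset=eval_dataset,
              metrics=[MetricPromptTemplateExamples.Pointwise.FLUENCY],
          ).evaluate(
              response_column_name="model_1_response",
          )
          ```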
    """

    _resource_noun = "evaluationTasks"

    def __init__(
        self,
        *,
        dataset: Union["pd.DataFrame", str, Dict[str, Any]],
        metrics: List[
            Union[
                Literal[
                    "exact_match",
                    "bleu",
                    "rouge_1",
                    "rouge_2",
                    "rouge_l",
                    "rouge_l_sum",
                    "tool_call_valid",
                    "tool_name_match",
                    "tool_parameter_key_match",
                    "tool_parameter_kv_match",
                ],
                metrics_base.CustomMetric,
                metrics_base._AutomaticMetric,
                metrics_base._TranslationMetric,
                pointwise_metric.PointwiseMetric,
                pairwise_metric.PairwiseMetric,
            ]
        ],
        experiment: Optional[str] = None,
        metric_column_mapping: Optional[Dict[str, str]] = None,
        output_uri_prefix: Optional[str] = "",
    ):
        """Initializes an EvalTask.

        Args:
            dataset: The dataset to be evaluated. Supports the following
                dataset formats:
                * pandas.DataFrame: Used directly for evaluation.
                * Dict: Converted to a pandas DataFrame before evaluation.
                * str: Interpreted as a file path or URI. Supported formats
                  include:
                    * Local JSONL or CSV files: Loaded from the local
                      filesystem.
                    * GCS JSONL or CSV files: Loaded from Google Cloud Storage
                      (e.g., 'gs://bucket/data.csv').
                    * BigQuery table URI: Loaded from Google Cloud BigQuery
                      (e.g., 'bq://project-id.dataset.table_name').
            metrics: The list of metric names, or Metric instances to evaluate.
                A metric prompt template is required for `PairwiseMetric`.
            experiment: The name of the experiment to log the evaluations to.
            metric_column_mapping: An optional column mapping dictionary that
                maps the metric prompt template input variable names to the
                corresponding evaluation dataset column names, used during
                evaluation. For example, if the input_variables of the metric
                prompt template are ["context", "reference"], the
                metric_column_mapping can be {"context": "news_context",
                "reference": "ground_truth", "response": "model_1_response"}
                if the dataset has columns "news_context", "ground_truth" and
                "model_1_response".
            output_uri_prefix: GCS location to store the metrics_table from
                evaluation results.
        """
        self._raw_dataset = dataset
        self._dataset = eval_utils.load_dataset(dataset)
        self._metrics = metrics
        self._experiment = experiment
        self._metric_column_mapping = eval_utils.initialize_metric_column_mapping(
            metric_column_mapping, self._dataset
        )
        self.output_uri_prefix = output_uri_prefix

    @property
    def dataset(self) -> "pd.DataFrame":
        """Returns evaluation dataset."""
        return self._dataset

    @property
    def metrics(self) -> List[Union[str, metrics_base.CustomMetric]]:
        """Returns metrics."""
        return self._metrics

    @property
    def experiment(self) -> Optional[str]:
        """Returns experiment name."""
        return self._experiment

    def _evaluate_with_experiment(
        self,
        *,
        model: Optional[Union[GenerativeModel, Callable[[str], str]]] = None,
        prompt_template: Optional[str] = None,
        experiment_run_name: Optional[str] = None,
        evaluation_service_qps: Optional[float] = None,
        retry_timeout: float = 120.0,
        output_file_name: Optional[str] = None,
    ) -> EvalResult:
        """Runs an evaluation for the EvalTask with an experiment.

        Args:
            model: A GenerativeModel instance or a custom model function to
                generate responses to evaluate. If not provided, the
                evaluation is computed with the `response` column in the
                `dataset`.
            prompt_template: The prompt template to use for the evaluation. If
                not set, the prompt template that was used to create the
                EvalTask will be used.
            experiment_run_name: The name of the experiment run to log the
                evaluation to if an experiment is set for this EvalTask. If
                not provided, a random unique experiment run name is used.
            evaluation_service_qps: The custom QPS limit for the evaluation
                service.
            retry_timeout: How long to keep retrying the evaluation requests
                for the whole evaluation dataset, in seconds.
            output_file_name: The file name with csv suffix to store the
                output metrics_table to be tracked in the experiment run.

        Returns:
            The evaluation result.
        """
        self._validate_experiment_run()
        with vertexai.preview.start_run(experiment_run_name):
            self._log_eval_experiment_param(
                model=model,
                prompt_template=prompt_template,
                output_file_name=output_file_name,
            )
            eval_result = _evaluation.evaluate(
                dataset=self._dataset,
                metrics=self._metrics,
                model=model,
                prompt_template=prompt_template,
                metric_column_mapping=self._metric_column_mapping,
                evaluation_service_qps=evaluation_service_qps,
                retry_timeout=retry_timeout,
            )

            eval_result.summary_metrics = {
                k: ("NaN" if isinstance(v, float) and np.isnan(v) else v)
                for k, v in eval_result.summary_metrics.items()
            }
            eval_result.metadata = {
                "experiment": self._experiment,
                "experiment_run": experiment_run_name,
            }
            try:
                vertexai.preview.log_metrics(eval_result.summary_metrics)
            except (TypeError, exceptions.InvalidArgument) as e:
                _LOGGER.warning(f"Experiment metrics logging failed: {str(e)}")
            return eval_result

    def evaluate(
        self,
        *,
        model: Optional[Union[GenerativeModel, Callable[[str], str]]] = None,
        prompt_template: Optional[str] = None,
        experiment_run_name: Optional[str] = None,
        response_column_name: Optional[str] = None,
        baseline_model_response_column_name: Optional[str] = None,
        evaluation_service_qps: Optional[float] = None,
        retry_timeout: float = 120.0,
        output_file_name: Optional[str] = None,
    ) -> EvalResult:
        """Runs an evaluation for the EvalTask.

        Args:
            model: A GenerativeModel instance or a custom model function to
                generate responses to evaluate. If not provided, the
                evaluation can be performed in bring-your-own-response (BYOR)
                mode.
            prompt_template: The prompt template to use for the evaluation. If
                not set, the prompt template that was used to create the
                EvalTask will be used.
            experiment_run_name: The name of the experiment run to log the
                evaluation to if an experiment is set for this EvalTask. If
                not provided, a random unique experiment run name is used.
            response_column_name: The column name of the model response in the
                dataset. If provided, this overrides the
                `metric_column_mapping` of the `EvalTask`.
            baseline_model_response_column_name: The column name of the
                baseline model response in the dataset for pairwise metrics.
                If provided, this overrides the `metric_column_mapping` of the
                `EvalTask`.
            evaluation_service_qps: The custom QPS limit for the evaluation
                service.
            retry_timeout: How long to keep retrying the evaluation requests
                for the whole evaluation dataset, in seconds.
            output_file_name: The file name with csv suffix to store the
                output metrics_table.

        Returns:
            The evaluation result.
        """
        global_experiment_name = metadata._experiment_tracker.experiment_name
        if experiment_run_name and not self._experiment and not global_experiment_name:
            raise ValueError(
                "Experiment is not set. Please initialize `EvalTask` with an"
                " experiment, or initialize a global experiment with"
                " `vertexai.init(experiment='experiment_name')` for logging"
                " this evaluation run."
            )

        if self.output_uri_prefix and not output_file_name:
            output_file_name = f"eval_results_{utils.timestamped_unique_name()}.csv"

        self._verify_and_set_response_column_name(
            response_column_name=response_column_name,
            metric_column_mapping_key=constants.Dataset.MODEL_RESPONSE_COLUMN,
        )
        self._verify_and_set_response_column_name(
            response_column_name=baseline_model_response_column_name,
            metric_column_mapping_key=constants.Dataset.BASELINE_MODEL_RESPONSE_COLUMN,
        )

        experiment_run_name = experiment_run_name or f"{uuid.uuid4()}"

        if self._experiment and global_experiment_name:
            # Both the EvalTask experiment and a global experiment are set:
            # run under the EvalTask experiment, then restore the global one.
            metadata._experiment_tracker.set_experiment(
                experiment=self._experiment, backing_tensorboard=False
            )
            eval_result = self._evaluate_with_experiment(
                model=model,
                prompt_template=prompt_template,
                experiment_run_name=experiment_run_name,
                evaluation_service_qps=evaluation_service_qps,
                retry_timeout=retry_timeout,
                output_file_name=output_file_name,
            )
            metadata._experiment_tracker.set_experiment(
                experiment=global_experiment_name,
                backing_tensorboard=False,
                display_button=False,
            )
        elif self._experiment and not global_experiment_name:
            # Only the EvalTask experiment is set: run under it, then reset
            # the experiment tracker.
            metadata._experiment_tracker.set_experiment(
                experiment=self._experiment, backing_tensorboard=False
            )
            eval_result = self._evaluate_with_experiment(
                model=model,
                prompt_template=prompt_template,
                experiment_run_name=experiment_run_name,
                evaluation_service_qps=evaluation_service_qps,
                retry_timeout=retry_timeout,
                output_file_name=output_file_name,
            )
            metadata._experiment_tracker.reset()
        elif not self._experiment and global_experiment_name:
            # Only a global experiment is set: log the run under it.
            eval_result = self._evaluate_with_experiment(
                model=model,
                prompt_template=prompt_template,
                experiment_run_name=experiment_run_name,
                evaluation_service_qps=evaluation_service_qps,
                retry_timeout=retry_timeout,
                output_file_name=output_file_name,
            )
        else:
            # No experiment is set: run the evaluation without experiment
            # logging.
            eval_result = _evaluation.evaluate(
                dataset=self.dataset,
                metrics=self.metrics,
                model=model,
                prompt_template=prompt_template,
                metric_column_mapping=self._metric_column_mapping,
                evaluation_service_qps=evaluation_service_qps,
                retry_timeout=retry_timeout,
            )

        candidate_model_name = None
        if isinstance(model, generative_models.GenerativeModel):
            candidate_model_name = model._model_name

        baseline_model_name = None
        pairwise_metrics = [
            metric
            for metric in self.metrics
            if isinstance(metric, pairwise_metric.PairwiseMetric)
        ]
        if pairwise_metrics:
            # All pairwise metrics should have the same baseline model.
            baseline_model = pairwise_metrics[0].baseline_model
            if isinstance(baseline_model, generative_models.GenerativeModel):
                baseline_model_name = baseline_model._model_name

        dataset_uri = None
        if isinstance(self._raw_dataset, str):
            dataset_uri = self._raw_dataset

        eval_utils.upload_evaluation_results(
            eval_result,
            self.output_uri_prefix,
            output_file_name,
            candidate_model_name,
            baseline_model_name,
            dataset_uri,
            self.metrics,
        )
        return eval_result

    def _validate_experiment_run(self) -> None:
        """Checks if an experiment run already exists."""
        if metadata._experiment_tracker.experiment_run:
            raise ValueError(
                "Experiment run already exists. Please specify the name of"
                " the experiment run to assign the current session within"
                " this evaluation."
            )

    def _log_eval_experiment_param(
        self,
        model: Optional[Union[GenerativeModel, Callable[[str], str]]] = None,
        prompt_template: Optional[str] = None,
        output_file_name: Optional[str] = None,
    ) -> None:
        """Logs variable input parameters of an evaluation to an experiment run."""
        eval_metadata = {}
        if prompt_template is not None:
            eval_metadata.update({"prompt_template": prompt_template})

        if isinstance(model, GenerativeModel):
            eval_metadata.update(
                {
                    "model_name": model._model_name,
                }
            )

            if model._generation_config and isinstance(model._generation_config, dict):
                eval_metadata.update(**model._generation_config)

            if model._safety_settings and isinstance(model._safety_settings, dict):
                safety_settings = model._safety_settings
                safety_settings_as_str = {
                    category.name: threshold.name
                    for category, threshold in safety_settings.items()
                }
                eval_metadata.update(safety_settings_as_str)

        if self.output_uri_prefix and output_file_name:
            eval_metadata.update(
                {"output_file": self.output_uri_prefix + "/" + output_file_name}
            )

        if eval_metadata:
            _LOGGER.info(f"Logging Eval Experiment metadata: {eval_metadata}")
            try:
                vertexai.preview.log_params(eval_metadata)
            except (ValueError, TypeError) as e:
                _LOGGER.warning(f"Experiment metadata logging failed: {str(e)}")

    def _verify_and_set_response_column_name(
        self, response_column_name: str, metric_column_mapping_key: str
    ) -> None:
        """Verifies and sets the model response column names."""
        if response_column_name:
            if response_column_name in self._dataset.columns:
                self._metric_column_mapping[
                    metric_column_mapping_key
                ] = response_column_name
            else:
                raise ValueError(
                    f"(Baseline) Model response column {response_column_name}"
                    " is not found in the dataset."
                )

    def display_runs(self):
        """Displays experiment runs associated with this EvalTask."""
        if not self._experiment:
            raise ValueError("Experiment is not set.")
        elif IPython_display:
            IPython_display.display(
                vertexai.preview.get_experiment_df(self._experiment)
            )