# -*- coding: utf-8 -*-

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Model-based Pairwise Metric."""

from typing import Callable, Optional, Union

from vertexai import generative_models
from vertexai.evaluation.metrics import _base
from vertexai.evaluation.metrics import (
    metric_prompt_template as metric_prompt_template_base,
)


class PairwiseMetric(_base._ModelBasedMetric):  # pylint: disable=protected-access
    """A Model-based Pairwise Metric.

    A model-based evaluation metric that compares two generative models'
    responses side-by-side, and allows users to A/B test their generative
    models to determine which model is performing better.

    For more details on when to use pairwise metrics, see
    [Evaluation methods and
    metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#pointwise_versus_pairwise).

    Result Details:

        * In `EvalResult.summary_metrics`, win rates are computed for both the
        baseline and the candidate model. The win rate is the proportion of
        one model's wins over the total number of comparisons, expressed as a
        decimal value between 0 and 1.

        * In `EvalResult.metrics_table`, a pairwise metric produces two
        evaluation results per dataset row:
            * `pairwise_choice`: The choice shows whether the candidate model
            or the baseline model performs better, or if they are equally
            good.
            * `explanation`: The rationale behind each verdict, produced with
            chain-of-thought reasoning. The explanation helps users scrutinize
            the judgment and builds appropriate trust in the decisions.

        See the [documentation
        page](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#understand-results)
        for more details on understanding the metric results.

    Usage Examples:

        ```
        baseline_model = GenerativeModel("gemini-1.0-pro")
        candidate_model = GenerativeModel("gemini-1.5-pro")

        pairwise_groundedness = PairwiseMetric(
            metric="pairwise_groundedness",
            metric_prompt_template=MetricPromptTemplateExamples.get_prompt_template(
                "pairwise_groundedness"
            ),
            baseline_model=baseline_model,
        )
        eval_dataset = pd.DataFrame({
            "prompt": [...],
        })
        pairwise_task = EvalTask(
            dataset=eval_dataset,
            metrics=[pairwise_groundedness],
            experiment="my-pairwise-experiment",
        )
        pairwise_result = pairwise_task.evaluate(
            model=candidate_model,
            experiment_run_name="gemini-pairwise-eval-run",
        )
        ```
    """

    def __init__(
        self,
        *,
        metric: str,
        metric_prompt_template: Union[
            metric_prompt_template_base.PairwiseMetricPromptTemplate, str
        ],
        baseline_model: Optional[
            Union[generative_models.GenerativeModel, Callable[[str], str]]
        ] = None,
    ):
        """Initializes a pairwise evaluation metric.

        Args:
            metric: The pairwise evaluation metric name.
            metric_prompt_template: Pairwise metric prompt template for
                performing the pairwise model-based evaluation. A freeform
                string is also accepted.
            baseline_model: The baseline model for the side-by-side
                comparison. If not specified, a `baseline_model_response`
                column is required in the dataset to perform a
                bring-your-own-response (BYOR) evaluation.
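
        Example:
            If `baseline_model` is omitted, pre-generated baseline responses
            are read from the dataset instead (BYOR). A minimal sketch,
            assuming the dataset columns described above and that the
            candidate model's pre-generated responses are supplied in a
            `response` column:

            ```
            byor_groundedness = PairwiseMetric(
                metric="pairwise_groundedness",
                metric_prompt_template=MetricPromptTemplateExamples.get_prompt_template(
                    "pairwise_groundedness"
                ),
                # No baseline_model: baseline responses come from the dataset.
            )
            byor_dataset = pd.DataFrame({
                "prompt": [...],
                "response": [...],  # candidate model's responses
                "baseline_model_response": [...],  # baseline model's responses
            })
            byor_result = EvalTask(
                dataset=byor_dataset,
                metrics=[byor_groundedness],
            ).evaluate()
            ```

            Since `baseline_model` also accepts any `Callable[[str], str]`, a
            plain function that maps a prompt string to a response string can
            stand in for a `GenerativeModel`.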
""" super().__init__( metric_prompt_template=metric_prompt_template, metric=metric, ) self._baseline_model = baseline_model @property def baseline_model( self, ) -> Union[generative_models.GenerativeModel, Callable[[str], str]]: return self._baseline_model