evo-ai/.venv/lib/python3.10/site-packages/vertexai/evaluation/metrics/pairwise_metric.py
# -*- coding: utf-8 -*-
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Model-based Pairwise Metric."""
from typing import Callable, Optional, Union
from vertexai import generative_models
from vertexai.evaluation.metrics import _base
from vertexai.evaluation.metrics import (
metric_prompt_template as metric_prompt_template_base,
)
class PairwiseMetric(_base._ModelBasedMetric): # pylint: disable=protected-access
"""A Model-based Pairwise Metric.
A model-based evaluation metric that compares two generative models' responses
side-by-side, and allows users to A/B test their generative models to
determine which model is performing better.
For more details on when to use pairwise metrics, see
[Evaluation methods and
metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#pointwise_versus_pairwise).
Result Details:
* In `EvalResult.summary_metrics`, win rates for both the baseline and
candidate model are computed. The win rate is computed as proportion of
wins of one model's responses to total attempts as a decimal value
between 0 and 1.
* In `EvalResult.metrics_table`, a pairwise metric produces two
evaluation results per dataset row:
* `pairwise_choice`: The choice shows whether the candidate model or
the baseline model performs better, or if they are equally good.
* `explanation`: The rationale behind each verdict using
chain-of-thought reasoning. The explanation helps users scrutinize
the judgment and builds appropriate trust in the decisions.
See [documentation
page](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#understand-results)
for more details on understanding the metric results.
Usage Examples:
```
baseline_model = GenerativeModel("gemini-1.0-pro")
candidate_model = GenerativeModel("gemini-1.5-pro")
pairwise_groundedness = PairwiseMetric(
metric_prompt_template=MetricPromptTemplateExamples.get_prompt_template(
"pairwise_groundedness"
),
baseline_model=baseline_model,
)
eval_dataset = pd.DataFrame({
"prompt" : [...],
})
pairwise_task = EvalTask(
dataset=eval_dataset,
metrics=[pairwise_groundedness],
experiment="my-pairwise-experiment",
)
pairwise_result = pairwise_task.evaluate(
model=candidate_model,
experiment_run_name="gemini-pairwise-eval-run",
)
```
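
        The result fields described under Result Details can then be
        inspected, for example (an illustrative sketch, not part of the
        original example; the "<metric name>/<field>" key and column naming
        mentioned in the comments is an assumption and may differ across SDK
        versions):

        ```
        # Aggregate win rates as decimals between 0 and 1, e.g. keys like
        # "pairwise_groundedness/candidate_model_win_rate" (assumed naming).
        print(pairwise_result.summary_metrics)

        # Per-row verdicts and chain-of-thought rationales, e.g. columns like
        # "pairwise_groundedness/pairwise_choice" and
        # "pairwise_groundedness/explanation" (assumed naming).
        print(pairwise_result.metrics_table.head())
        ```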
"""
    def __init__(
        self,
        *,
        metric: str,
        metric_prompt_template: Union[
            metric_prompt_template_base.PairwiseMetricPromptTemplate, str
        ],
        baseline_model: Optional[
            Union[generative_models.GenerativeModel, Callable[[str], str]]
        ] = None,
    ):
"""Initializes a pairwise evaluation metric.
Args:
metric: The pairwise evaluation metric name.
metric_prompt_template: Pairwise metric prompt template for performing
the pairwise model-based evaluation. A freeform string is also accepted.
baseline_model: The baseline model for side-by-side comparison. If not
specified, `baseline_model_response` column is required in the dataset
to perform bring-your-own-response(BYOR) evaluation.
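
        Example (an illustrative BYOR sketch, not part of the original
        docstring): evaluating pre-generated responses without a baseline
        model object. The `response` column name for candidate responses and
        the model-less `evaluate()` call are assumptions about the BYOR flow;
        only `baseline_model_response` is stated above.

            ```
            byor_dataset = pd.DataFrame({
                "prompt": [...],
                # Pre-generated candidate responses (assumed column name).
                "response": [...],
                # Required here because no `baseline_model` is passed.
                "baseline_model_response": [...],
            })
            pairwise_metric = PairwiseMetric(
                metric="pairwise_groundedness",
                metric_prompt_template=MetricPromptTemplateExamples.get_prompt_template(
                    "pairwise_groundedness"
                ),
            )
            # No model argument: responses are read from the dataset columns.
            byor_result = EvalTask(
                dataset=byor_dataset,
                metrics=[pairwise_metric],
            ).evaluate()
            ```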
"""
        super().__init__(
            metric_prompt_template=metric_prompt_template,
            metric=metric,
        )
        self._baseline_model = baseline_model

    @property
    def baseline_model(
        self,
    ) -> Union[generative_models.GenerativeModel, Callable[[str], str]]:
        return self._baseline_model