structure saas with tools
@@ -0,0 +1,72 @@
# -*- coding: utf-8 -*-

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Evaluation Metrics Module."""

from vertexai.preview.evaluation.metrics import _base
from vertexai.preview.evaluation.metrics import _rouge
from vertexai.preview.evaluation.metrics import (
    _trajectory_single_tool_use,
)
from vertexai.preview.evaluation.metrics import (
    custom_output_config,
)
from vertexai.preview.evaluation.metrics import (
    metric_prompt_template,
)
from vertexai.preview.evaluation.metrics import (
    metric_prompt_template_examples,
)
from vertexai.preview.evaluation.metrics import pairwise_metric
from vertexai.preview.evaluation.metrics import pointwise_metric
from vertexai.preview.evaluation.metrics import (
    predefined_rubric_metrics,
)
from vertexai.preview.evaluation.metrics import (
    rubric_based_metric,
)


PairwiseMetric = pairwise_metric.PairwiseMetric
PointwiseMetric = pointwise_metric.PointwiseMetric
CustomMetric = _base.CustomMetric
PairwiseMetricPromptTemplate = metric_prompt_template.PairwiseMetricPromptTemplate
PointwiseMetricPromptTemplate = metric_prompt_template.PointwiseMetricPromptTemplate
MetricPromptTemplateExamples = (
    metric_prompt_template_examples.MetricPromptTemplateExamples
)
Rouge = _rouge.Rouge
TrajectorySingleToolUse = _trajectory_single_tool_use.TrajectorySingleToolUse
CustomOutputConfig = custom_output_config.CustomOutputConfig
RubricBasedMetric = rubric_based_metric.RubricBasedMetric
RubricGenerationConfig = _base.RubricGenerationConfig
PredefinedRubricMetrics = predefined_rubric_metrics.PredefinedRubricMetrics


__all__ = [
    "CustomMetric",
    "PairwiseMetric",
    "PointwiseMetric",
    "PairwiseMetricPromptTemplate",
    "PointwiseMetricPromptTemplate",
    "MetricPromptTemplateExamples",
    "Rouge",
    "TrajectorySingleToolUse",
    "CustomOutputConfig",
    "RubricBasedMetric",
    "RubricGenerationConfig",
    "PredefinedRubricMetrics",
]
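

# --- Illustrative usage sketch (editor's addition, not part of the original
# commit). It shows how the names exported above are typically combined; the
# criteria and rubric strings are hypothetical.
def _example_build_fluency_metric() -> PointwiseMetric:
    """Builds a sample pointwise metric from a prompt template."""
    template = PointwiseMetricPromptTemplate(
        criteria={"fluency": "The response is grammatical and easy to read."},
        rating_rubric={"1": "Fluent", "0": "Not fluent"},
    )
    return PointwiseMetric(
        metric="fluency",
        metric_prompt_template=template,
    )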
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,168 @@
# -*- coding: utf-8 -*-

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Base classes for evaluation metrics."""

import abc
from typing import Any, Callable, Dict, Literal, Optional, Union, List

from google.cloud.aiplatform_v1beta1.types import (
    evaluation_service as gapic_eval_service_types,
)
from vertexai import generative_models
from vertexai.preview.evaluation import constants
from vertexai.preview.evaluation.metrics import (
    custom_output_config as custom_output_config_class,
)
from vertexai.preview.evaluation.metrics import (
    metric_prompt_template as metric_prompt_template_base,
)


_ModelType = Union[generative_models.GenerativeModel, Callable[[str], str]]


class _Metric(abc.ABC):
    """The abstract class for evaluation metric."""

    def __init__(self, metric: str):
        self._metric = metric

    def __str__(self):
        return self.metric_name

    @property
    def metric_name(self) -> str:
        return self._metric


class _ModelBasedMetric(_Metric):
    """A Model-based Metric.

    An evaluation metric that evaluates generative AI model responses with
    another generative model as a judge. This metric can be used to evaluate a
    single model, or two models side-by-side.

    For more details on when to use model-based metrics, see
    [Evaluation methods and metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval).
    """

    def __init__(
        self,
        *,
        metric: str,
        metric_prompt_template: Union[
            metric_prompt_template_base.PointwiseMetricPromptTemplate,
            metric_prompt_template_base.PairwiseMetricPromptTemplate,
            str,
        ],
        system_instruction: Optional[str] = None,
        autorater_config: Optional[gapic_eval_service_types.AutoraterConfig] = None,
        custom_output_config: Optional[
            custom_output_config_class.CustomOutputConfig
        ] = None,
    ):
        """Initializes the model-based evaluation metric.

        Args:
            metric: Generic model-based metric name.
            metric_prompt_template: A metric prompt template for performing
                the model-based evaluation. A freeform string is also accepted.
            system_instruction: The system instruction to be used in the metric
                prompt.
            autorater_config: The config for the judge model.
            custom_output_config: Config for custom output from the judge model.
        """
        super().__init__(metric=metric)
        self.metric_prompt_template = str(metric_prompt_template)
        self.system_instruction = system_instruction
        self.autorater_config = autorater_config
        self.custom_output_config = custom_output_config


class CustomMetric(_Metric):
    """The custom evaluation metric.

    A fully-customized CustomMetric that can be used to evaluate a single model
    by defining a metric function for a computation-based metric. The
    CustomMetric is computed on the client side using the user-defined metric
    function in the SDK only, not by the Vertex Gen AI Evaluation Service.

    Attributes:
        name: The name of the metric.
        metric_function: The user-defined evaluation function to compute a metric
            score. It must use the dataset row dictionary as the metric function
            input and return a per-instance metric result as a dictionary output.
            The metric score must be mapped to the name of the CustomMetric as
            the key.
    """

    def __init__(
        self,
        name: str,
        metric_function: Callable[
            [Dict[str, Any]],
            Dict[str, Any],
        ],
    ):
        """Initializes the evaluation metric."""
        super().__init__(name)
        self.name = name
        self.metric_function = metric_function


class _AutomaticMetric(_Metric):
    """An automatic metric that computes a deterministic score based on a reference.

    A lexicon-based evaluation metric that evaluates a generative model's
    response on the given evaluation task against reference ground-truth answers.
    It is a type of pointwise evaluation metric.

    For more details on when to use automatic metrics, see
    [Evaluation methods and
    metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval).
    """

    def __init__(
        self,
        metric: Literal[constants.Metric.ROUGE],
    ):
        """Initializes the automatic evaluation metric.

        Args:
            metric: The automatic evaluation metric name.
        """
        super().__init__(metric=metric)


class RubricGenerationConfig:
    """The rubric generation config."""

    def __init__(
        self,
        prompt_template: str,
        model: Optional[_ModelType] = None,
        parsing_fn: Optional[Callable[[str], List[str]]] = None,
    ):
        """Initializes the rubric generation config.

        Args:
            prompt_template: The prompt template for rubric generation.
            model: The model to use for rubric generation.
            parsing_fn: The function to parse the rubric generation response.
        """
        self.prompt_template = prompt_template
        self.model = model
        self.parsing_fn = parsing_fn
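

# --- Illustrative usage sketch (editor's addition, not part of the original
# commit). CustomMetric is computed client-side: the metric function receives a
# dataset row dict and must key its score by the metric's name. The "response"
# row field and the length-based score below are hypothetical.
def _example_custom_metric() -> CustomMetric:
    """Builds a sample client-side CustomMetric."""

    def word_count(row: Dict[str, Any]) -> Dict[str, Any]:
        # The returned dict must map the metric name to its score.
        return {"word_count": float(len(str(row.get("response", "")).split()))}

    return CustomMetric(name="word_count", metric_function=word_count)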
File diff suppressed because it is too large
@@ -0,0 +1,802 @@
# -*- coding: utf-8 -*-

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Library for metrics computation with Gen AI Evaluation Service."""

import json
from typing import Any, Dict, List, Union

from google import api_core
from google.cloud.aiplatform import base
from google.cloud.aiplatform import initializer
from google.cloud.aiplatform_v1beta1.services import (
    evaluation_service as gapic_evaluation_services,
)
from google.cloud.aiplatform_v1beta1.types import (
    evaluation_service as gapic_eval_service_types,
)
from vertexai.preview.evaluation import _base as eval_base
from vertexai.preview.evaluation import constants
from vertexai.preview.evaluation import multimodal_utils
from vertexai.preview.evaluation import (
    prompt_template as prompt_template_base,
)
from vertexai.preview.evaluation import utils
from vertexai.preview.evaluation.metrics import (
    _base as metrics_base,
)
from vertexai.preview.evaluation.metrics import (
    _default_templates,
)
from vertexai.preview.evaluation.metrics import _rouge
from vertexai.preview.evaluation.metrics import (
    _trajectory_single_tool_use,
)
from vertexai.preview.evaluation.metrics import (
    custom_output_config as custom_output_config_class,
)
from vertexai.preview.evaluation.metrics import pairwise_metric
from vertexai.preview.evaluation.metrics import pointwise_metric
from google.protobuf import json_format


_LOGGER = base.Logger(__name__)
_METRIC_NAME_TO_METRIC_SPEC = {
    # Automatic Metrics.
    constants.Metric.EXACT_MATCH: (gapic_eval_service_types.ExactMatchSpec()),
    constants.Metric.BLEU: gapic_eval_service_types.BleuSpec(),
    constants.Metric.ROUGE: gapic_eval_service_types.RougeSpec(),
    constants.Metric.ROUGE_1: gapic_eval_service_types.RougeSpec(rouge_type="rouge1"),
    constants.Metric.ROUGE_2: gapic_eval_service_types.RougeSpec(rouge_type="rouge2"),
    constants.Metric.ROUGE_L: gapic_eval_service_types.RougeSpec(rouge_type="rougeL"),
    constants.Metric.ROUGE_L_SUM: gapic_eval_service_types.RougeSpec(
        rouge_type="rougeLsum"
    ),
    constants.Metric.TOOL_CALL_VALID: (gapic_eval_service_types.ToolCallValidSpec()),
    constants.Metric.TOOL_NAME_MATCH: (gapic_eval_service_types.ToolNameMatchSpec()),
    constants.Metric.TOOL_PARAMETER_KV_MATCH: (
        gapic_eval_service_types.ToolParameterKVMatchSpec()
    ),
    constants.Metric.TOOL_PARAMETER_KEY_MATCH: (
        gapic_eval_service_types.ToolParameterKeyMatchSpec()
    ),
    # Pointwise Metrics.
    constants.Metric.POINTWISE_METRIC: (gapic_eval_service_types.PointwiseMetricSpec()),
    # Pairwise Metrics.
    constants.Metric.PAIRWISE_METRIC: (gapic_eval_service_types.PairwiseMetricSpec()),
    constants.Metric.RUBRIC_BASED_INSTRUCTION_FOLLOWING: (
        gapic_eval_service_types.RubricBasedInstructionFollowingSpec()
    ),
    constants.Metric.TRAJECTORY_EXACT_MATCH: (
        gapic_eval_service_types.TrajectoryExactMatchSpec()
    ),
    constants.Metric.TRAJECTORY_IN_ORDER_MATCH: (
        gapic_eval_service_types.TrajectoryInOrderMatchSpec()
    ),
    constants.Metric.TRAJECTORY_ANY_ORDER_MATCH: (
        gapic_eval_service_types.TrajectoryAnyOrderMatchSpec()
    ),
    constants.Metric.TRAJECTORY_PRECISION: (
        gapic_eval_service_types.TrajectoryPrecisionSpec()
    ),
    constants.Metric.TRAJECTORY_RECALL: (
        gapic_eval_service_types.TrajectoryRecallSpec()
    ),
    constants.Metric.TRAJECTORY_SINGLE_TOOL_USE: (
        gapic_eval_service_types.TrajectorySingleToolUseSpec()
    ),
}
_QUESTION_TEMPLATE = """<question>{question}"""


def _format_rubrics(questions: List[str]) -> str:
    """Formats the list of rubrics into a question block."""
    question_block = "\n".join(
        _QUESTION_TEMPLATE.format(question=q.strip()) for q in questions
    )
    return question_block


def build_custom_output_format_config(
    custom_output_config: custom_output_config_class.CustomOutputConfig,
) -> Union[gapic_eval_service_types.CustomOutputFormatConfig, None]:
    """Builds a CustomOutputFormatConfig from user input."""
    custom_output_cfg = gapic_eval_service_types.CustomOutputFormatConfig()
    if custom_output_config.return_raw_output:
        custom_output_cfg.return_raw_output = True
        return custom_output_cfg
    else:
        return None


def build_trajectory(
    trajectory: Union[str, List[Dict[str, Any]]],
) -> gapic_eval_service_types.Trajectory:
    """Builds a trajectory from user input."""
    if not trajectory:
        return

    if isinstance(trajectory, str):
        trajectory = json.loads(trajectory)

    if isinstance(trajectory, List):
        try:
            tool_calls = []
            for tool_call_dict in trajectory:
                tool_input_str = json.dumps(tool_call_dict["tool_input"])
                tool_calls.append(
                    gapic_eval_service_types.ToolCall(
                        tool_name=tool_call_dict["tool_name"], tool_input=tool_input_str
                    )
                )
            return gapic_eval_service_types.Trajectory(tool_calls=tool_calls)
        except KeyError as e:
            _LOGGER.error(f"Failed to parse trajectory: {e}")
    else:
        _LOGGER.error(
            f"Unsupported trajectory type: {type(trajectory)}, expected list or"
            " a JSON array."
        )
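

# --- Illustrative sketch (editor's addition, not part of the original commit).
# build_trajectory() accepts either a JSON string or a list of dicts, where each
# dict carries "tool_name" and "tool_input" keys; the tool names and inputs
# below are hypothetical.
def _example_trajectory() -> gapic_eval_service_types.Trajectory:
    """Builds a sample Trajectory from a list of tool-call dicts."""
    predicted_calls = [
        {"tool_name": "search_flights", "tool_input": {"destination": "SFO"}},
        {"tool_name": "book_flight", "tool_input": {"flight_id": "UA-123"}},
    ]
    return build_trajectory(predicted_calls)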


def build_request(
    metric: Union[str, metrics_base._Metric],
    row_dict: Dict[str, Any],
    evaluation_run_config: eval_base.EvaluationRunConfig,
) -> gapic_eval_service_types.EvaluateInstancesRequest:
    """Builds a metric instance and forms the request for the evaluation service.

    Args:
        metric: The name of the metric to evaluate.
        row_dict: An evaluation dataset instance as a dictionary.
        evaluation_run_config: Evaluation run configurations.

    Returns:
        A single EvaluateInstancesRequest.

    Raises:
        ValueError: If required request fields are not provided.
    """
    project = initializer.global_config.project
    location = initializer.global_config.location
    if not project or not location:
        raise ValueError(
            "No project or location specified. Please run `vertexai.init()` to"
            " provide these parameters."
        )
    location_path = (
        gapic_evaluation_services.EvaluationServiceClient.common_location_path(
            project, location
        )
    )

    if isinstance(metric, pointwise_metric.PointwiseMetric):
        metric_name = constants.Metric.POINTWISE_METRIC
    elif isinstance(metric, pairwise_metric.PairwiseMetric):
        metric_name = constants.Metric.PAIRWISE_METRIC
    else:
        metric_name = str(metric)

    try:
        metric_spec = _METRIC_NAME_TO_METRIC_SPEC[metric_name]
    except KeyError as e:
        raise ValueError(f"Metric name: {metric_name} is not supported.") from e

    model_based_metric_instance_input = {}
    metric_column_mapping = evaluation_run_config.metric_column_mapping
    if isinstance(
        metric, metrics_base._ModelBasedMetric  # pylint: disable=protected-access
    ):
        metric_spec.metric_prompt_template = metric.metric_prompt_template
        metric_spec.system_instruction = metric.system_instruction
        if metric.custom_output_config:
            metric_spec.custom_output_format_config = build_custom_output_format_config(
                metric.custom_output_config
            )
        for variable in prompt_template_base.PromptTemplate(
            metric.metric_prompt_template
        ).variables:
            model_based_metric_instance_input[variable] = row_dict.get(
                metric_column_mapping.get(variable),
                "",
            )
    if isinstance(metric, pairwise_metric.PairwiseMetric):
        metric_column_mapping = evaluation_run_config.metric_column_mapping
        metric_spec.candidate_response_field_name = metric_column_mapping.get(
            constants.Dataset.MODEL_RESPONSE_COLUMN,
            constants.Dataset.MODEL_RESPONSE_COLUMN,
        )
        metric_spec.baseline_response_field_name = metric_column_mapping.get(
            constants.Dataset.BASELINE_MODEL_RESPONSE_COLUMN,
            constants.Dataset.BASELINE_MODEL_RESPONSE_COLUMN,
        )
    elif isinstance(metric, _rouge.Rouge):
        metric_spec.rouge_type = metric.rouge_type
        metric_spec.use_stemmer = metric.use_stemmer
        metric_spec.split_summaries = metric.split_summaries
    elif isinstance(metric, _trajectory_single_tool_use.TrajectorySingleToolUse):
        metric_spec.tool_name = metric.tool_name

    response = row_dict.get(
        metric_column_mapping.get(constants.Dataset.MODEL_RESPONSE_COLUMN), ""
    )
    reference = row_dict.get(
        metric_column_mapping.get(constants.Dataset.REFERENCE_COLUMN), ""
    )
    predicted_trajectory = build_trajectory(
        row_dict.get(
            metric_column_mapping.get(constants.Dataset.PREDICTED_TRAJECTORY_COLUMN),
            "",
        )
    )
    reference_trajectory = build_trajectory(
        row_dict.get(
            metric_column_mapping.get(constants.Dataset.REFERENCE_TRAJECTORY_COLUMN),
            "",
        )
    )
    if isinstance(metric, metrics_base._ModelBasedMetric):
        if metric_spec.metric_prompt_template in (
            _default_templates.INSTRUCTION_FOLLOWING_RUBRIC_CRITIQUE_TEMPLATE,
            _default_templates.MULTIMODAL_UNDERSTANDING_RUBRIC_CRITIQUE_TEMPLATE,
            _default_templates.TEXT_QUALITY_RUBRIC_CRITIQUE_TEMPLATE,
            _default_templates.PAIRWISE_INSTRUCTION_FOLLOWING_RUBRIC_CRITIQUE_TEMPLATE,
            _default_templates.PAIRWISE_MULTIMODAL_UNDERSTANDING_RUBRIC_CRITIQUE_TEMPLATE,
            _default_templates.PAIRWISE_TEXT_QUALITY_RUBRIC_CRITIQUE_TEMPLATE,
        ):
            model_based_metric_instance_input[
                constants.Dataset.RUBRICS_COLUMN
            ] = _format_rubrics(
                model_based_metric_instance_input[constants.Dataset.RUBRICS_COLUMN]
            )
        if (
            constants.Dataset.RUBRICS_COLUMN in model_based_metric_instance_input
            and isinstance(
                model_based_metric_instance_input[constants.Dataset.RUBRICS_COLUMN],
                List,
            )
        ):
            model_based_metric_instance_input[
                constants.Dataset.RUBRICS_COLUMN
            ] = "\n".join(
                model_based_metric_instance_input[constants.Dataset.RUBRICS_COLUMN]
            )

    if metric_name == constants.Metric.EXACT_MATCH:
        instance = gapic_eval_service_types.ExactMatchInput(
            metric_spec=metric_spec,
            instances=[
                gapic_eval_service_types.ExactMatchInstance(
                    prediction=response,
                    reference=reference,
                )
            ],
        )
        return gapic_eval_service_types.EvaluateInstancesRequest(
            location=location_path,
            exact_match_input=instance,
        )
    elif metric_name == constants.Metric.BLEU:
        instance = gapic_eval_service_types.BleuInput(
            metric_spec=metric_spec,
            instances=[
                gapic_eval_service_types.BleuInstance(
                    prediction=response,
                    reference=reference,
                )
            ],
        )
        return gapic_eval_service_types.EvaluateInstancesRequest(
            location=location_path,
            bleu_input=instance,
        )
    elif metric_name in (
        constants.Metric.ROUGE,
        constants.Metric.ROUGE_1,
        constants.Metric.ROUGE_2,
        constants.Metric.ROUGE_L,
        constants.Metric.ROUGE_L_SUM,
    ):
        instance = gapic_eval_service_types.RougeInput(
            metric_spec=metric_spec,
            instances=[
                gapic_eval_service_types.RougeInstance(
                    prediction=response,
                    reference=reference,
                )
            ],
        )
        return gapic_eval_service_types.EvaluateInstancesRequest(
            location=location_path,
            rouge_input=instance,
        )
    elif metric_name == constants.Metric.TOOL_CALL_VALID:
        instance = gapic_eval_service_types.ToolCallValidInput(
            metric_spec=metric_spec,
            instances=[
                gapic_eval_service_types.ToolCallValidInstance(
                    prediction=response,
                    reference=reference,
                )
            ],
        )
        return gapic_eval_service_types.EvaluateInstancesRequest(
            location=location_path,
            tool_call_valid_input=instance,
        )
    elif metric_name == constants.Metric.TOOL_NAME_MATCH:
        instance = gapic_eval_service_types.ToolNameMatchInput(
            metric_spec=metric_spec,
            instances=[
                gapic_eval_service_types.ToolNameMatchInstance(
                    prediction=response,
                    reference=reference,
                )
            ],
        )
        return gapic_eval_service_types.EvaluateInstancesRequest(
            location=location_path,
            tool_name_match_input=instance,
        )
    elif metric_name == constants.Metric.TOOL_PARAMETER_KEY_MATCH:
        instance = gapic_eval_service_types.ToolParameterKeyMatchInput(
            metric_spec=metric_spec,
            instances=[
                gapic_eval_service_types.ToolParameterKeyMatchInstance(
                    prediction=response,
                    reference=reference,
                )
            ],
        )
        return gapic_eval_service_types.EvaluateInstancesRequest(
            location=location_path,
            tool_parameter_key_match_input=instance,
        )
    elif metric_name == constants.Metric.TOOL_PARAMETER_KV_MATCH:
        instance = gapic_eval_service_types.ToolParameterKVMatchInput(
            metric_spec=metric_spec,
            instances=[
                gapic_eval_service_types.ToolParameterKVMatchInstance(
                    prediction=response,
                    reference=reference,
                )
            ],
        )
        return gapic_eval_service_types.EvaluateInstancesRequest(
            location=location_path,
            tool_parameter_kv_match_input=instance,
        )
    elif metric_name == constants.Metric.POINTWISE_METRIC:
        if multimodal_utils.is_multimodal_instance(model_based_metric_instance_input):
            instance = gapic_eval_service_types.PointwiseMetricInput(
                metric_spec=metric_spec,
                instance=gapic_eval_service_types.PointwiseMetricInstance(
                    content_map_instance=multimodal_utils.convert_multimodal_response_to_content_map(
                        model_based_metric_instance_input
                    ),
                ),
            )
        else:
            instance = gapic_eval_service_types.PointwiseMetricInput(
                metric_spec=metric_spec,
                instance=gapic_eval_service_types.PointwiseMetricInstance(
                    json_instance=json.dumps(model_based_metric_instance_input),
                ),
            )
        autorater_config = evaluation_run_config.autorater_config
        if (
            isinstance(metric, metrics_base._ModelBasedMetric)
            and metric.autorater_config
        ):
            autorater_config = metric.autorater_config
        return gapic_eval_service_types.EvaluateInstancesRequest(
            location=location_path,
            pointwise_metric_input=instance,
            autorater_config=autorater_config,
        )
    elif metric_name == constants.Metric.PAIRWISE_METRIC:
        if multimodal_utils.is_multimodal_instance(model_based_metric_instance_input):
            instance = gapic_eval_service_types.PairwiseMetricInput(
                metric_spec=metric_spec,
                instance=gapic_eval_service_types.PairwiseMetricInstance(
                    content_map_instance=multimodal_utils.convert_multimodal_response_to_content_map(
                        model_based_metric_instance_input
                    ),
                ),
            )
        else:
            instance = gapic_eval_service_types.PairwiseMetricInput(
                metric_spec=metric_spec,
                instance=gapic_eval_service_types.PairwiseMetricInstance(
                    json_instance=json.dumps(model_based_metric_instance_input),
                ),
            )
        autorater_config = evaluation_run_config.autorater_config
        if (
            isinstance(metric, metrics_base._ModelBasedMetric)
            and metric.autorater_config
        ):
            autorater_config = metric.autorater_config
        return gapic_eval_service_types.EvaluateInstancesRequest(
            location=location_path,
            pairwise_metric_input=instance,
            autorater_config=autorater_config,
        )
    elif metric_name == constants.Metric.RUBRIC_BASED_INSTRUCTION_FOLLOWING:
        required_rbif_fields = [
            constants.Dataset.MODEL_RESPONSE_COLUMN,
            constants.Dataset.PROMPT_COLUMN,
        ]
        for field in required_rbif_fields:
            column_name = metric_column_mapping.get(field)
            value = row_dict.get(column_name)
            if value is None and field in required_rbif_fields:
                raise ValueError(
                    f"Missing required field: `{field}` for "
                    f"{constants.Metric.RUBRIC_BASED_INSTRUCTION_FOLLOWING}."
                )
            else:
                model_based_metric_instance_input[field] = value
        instance = gapic_eval_service_types.RubricBasedInstructionFollowingInput(
            metric_spec=metric_spec,
            instance=gapic_eval_service_types.RubricBasedInstructionFollowingInstance(
                json_instance=json.dumps(model_based_metric_instance_input),
            ),
        )
        return gapic_eval_service_types.EvaluateInstancesRequest(
            location=location_path,
            rubric_based_instruction_following_input=instance,
        )
    elif metric_name == constants.Metric.TRAJECTORY_EXACT_MATCH:
        instance = gapic_eval_service_types.TrajectoryExactMatchInput(
            metric_spec=metric_spec,
            instances=[
                gapic_eval_service_types.TrajectoryExactMatchInstance(
                    predicted_trajectory=predicted_trajectory,
                    reference_trajectory=reference_trajectory,
                )
            ],
        )
        return gapic_eval_service_types.EvaluateInstancesRequest(
            location=location_path,
            trajectory_exact_match_input=instance,
        )
    elif metric_name == constants.Metric.TRAJECTORY_IN_ORDER_MATCH:
        instance = gapic_eval_service_types.TrajectoryInOrderMatchInput(
            metric_spec=metric_spec,
            instances=[
                gapic_eval_service_types.TrajectoryInOrderMatchInstance(
                    predicted_trajectory=predicted_trajectory,
                    reference_trajectory=reference_trajectory,
                )
            ],
        )
        return gapic_eval_service_types.EvaluateInstancesRequest(
            location=location_path,
            trajectory_in_order_match_input=instance,
        )
    elif metric_name == constants.Metric.TRAJECTORY_ANY_ORDER_MATCH:
        instance = gapic_eval_service_types.TrajectoryAnyOrderMatchInput(
            metric_spec=metric_spec,
            instances=[
                gapic_eval_service_types.TrajectoryAnyOrderMatchInstance(
                    predicted_trajectory=predicted_trajectory,
                    reference_trajectory=reference_trajectory,
                )
            ],
        )
        return gapic_eval_service_types.EvaluateInstancesRequest(
            location=location_path,
            trajectory_any_order_match_input=instance,
        )
    elif metric_name == constants.Metric.TRAJECTORY_PRECISION:
        instance = gapic_eval_service_types.TrajectoryPrecisionInput(
            metric_spec=metric_spec,
            instances=[
                gapic_eval_service_types.TrajectoryPrecisionInstance(
                    predicted_trajectory=predicted_trajectory,
                    reference_trajectory=reference_trajectory,
                )
            ],
        )
        return gapic_eval_service_types.EvaluateInstancesRequest(
            location=location_path,
            trajectory_precision_input=instance,
        )
    elif metric_name == constants.Metric.TRAJECTORY_RECALL:
        instance = gapic_eval_service_types.TrajectoryRecallInput(
            metric_spec=metric_spec,
            instances=[
                gapic_eval_service_types.TrajectoryRecallInstance(
                    predicted_trajectory=predicted_trajectory,
                    reference_trajectory=reference_trajectory,
                )
            ],
        )
        return gapic_eval_service_types.EvaluateInstancesRequest(
            location=location_path,
            trajectory_recall_input=instance,
        )
    elif metric_name == constants.Metric.TRAJECTORY_SINGLE_TOOL_USE:
        instance = gapic_eval_service_types.TrajectorySingleToolUseInput(
            metric_spec=metric_spec,
            instances=[
                gapic_eval_service_types.TrajectorySingleToolUseInstance(
                    predicted_trajectory=predicted_trajectory,
                )
            ],
        )
        return gapic_eval_service_types.EvaluateInstancesRequest(
            location=location_path,
            trajectory_single_tool_use_input=instance,
        )
    else:
        raise ValueError(f"Unknown metric type: {metric_name}")
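

# --- Illustrative sketch (editor's addition, not part of the original commit).
# It mirrors what build_request() assembles for a ROUGE metric, using the gapic
# types directly; the prediction/reference strings and the location path are
# hypothetical.
def _example_rouge_request() -> gapic_eval_service_types.EvaluateInstancesRequest:
    """Builds a sample ROUGE EvaluateInstancesRequest by hand."""
    instance = gapic_eval_service_types.RougeInput(
        metric_spec=gapic_eval_service_types.RougeSpec(rouge_type="rougeLsum"),
        instances=[
            gapic_eval_service_types.RougeInstance(
                prediction="The cat sat on the mat.",
                reference="A cat was sitting on the mat.",
            )
        ],
    )
    return gapic_eval_service_types.EvaluateInstancesRequest(
        location="projects/my-project/locations/us-central1",
        rouge_input=instance,
    )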


def _parse_autometric_results(
    metric_result_dict: Dict[str, Any],
) -> Dict[str, Any]:
    """Parses the automatic metric results from the evaluation results.

    Args:
        metric_result_dict: The metric results dictionary.

    Returns:
        A dictionary containing the metric score.
    """
    for value in metric_result_dict.values():
        return {
            constants.MetricResult.SCORE_KEY: value[0].get(
                constants.MetricResult.SCORE_KEY
            )
        }


def _parse_pointwise_results(
    metric_result_dict: Dict[str, Any],
    metric: Union[str, metrics_base._Metric],
) -> Dict[str, Any]:
    """Parses the model-based pointwise metric results from the evaluation results.

    Args:
        metric_result_dict: The metric results dictionary.
        metric: The metric to evaluate.

    Returns:
        One of the following:
        1. A dictionary containing raw outputs from the judge model if
           return_raw_output is set to True in custom_output_config.
        2. A dictionary containing the metric score and explanation of the
           metric if custom_output_config is not set.
    """
    if (
        isinstance(metric, pointwise_metric.PointwiseMetric)
        and getattr(metric, "custom_output_config", None)
        and getattr(metric.custom_output_config, "return_raw_output", False)
    ):
        raw_outputs = (
            metric_result_dict.get(constants.MetricResult.CUSTOM_OUTPUT_KEY)
            .get(constants.MetricResult.RAW_OUTPUTS_KEY)
            .get(constants.MetricResult.RAW_OUTPUT_KEY)
        )
        if (
            isinstance(metric, pointwise_metric.PointwiseMetric)
            and getattr(metric, "custom_output_config", None)
            and getattr(metric.custom_output_config, "parsing_fn", None)
        ):
            parsing_fn = metric.custom_output_config.parsing_fn
            return parsing_fn(raw_outputs)
        return {constants.MetricResult.RAW_OUTPUT_KEY: raw_outputs}
    else:
        return {
            constants.MetricResult.SCORE_KEY: metric_result_dict.get(
                constants.MetricResult.SCORE_KEY
            ),
            constants.MetricResult.EXPLANATION_KEY: metric_result_dict.get(
                constants.MetricResult.EXPLANATION_KEY
            ),
        }


def _parse_pairwise_results(
    metric_result_dict: Dict[str, Any],
    metric: Union[str, metrics_base._Metric],
) -> Dict[str, Any]:
    """Parses the pairwise metric results from the evaluation results.

    Args:
        metric_result_dict: The metric results dictionary.
        metric: The metric to evaluate.

    Returns:
        One of the following:
        1. A dictionary containing raw outputs from the judge model if
           return_raw_output is set to True in custom_output_config.
        2. A dictionary containing the pairwise choice and explanation of the
           metric if custom_output_config is not set.
    """
    if (
        isinstance(metric, pairwise_metric.PairwiseMetric)
        and getattr(metric, "custom_output_config", None)
        and getattr(metric.custom_output_config, "return_raw_output", False)
    ):
        raw_outputs = (
            metric_result_dict.get(constants.MetricResult.CUSTOM_OUTPUT_KEY)
            .get(constants.MetricResult.RAW_OUTPUTS_KEY)
            .get(constants.MetricResult.RAW_OUTPUT_KEY)
        )
        if (
            isinstance(metric, pairwise_metric.PairwiseMetric)
            and getattr(metric, "custom_output_config", None)
            and getattr(metric.custom_output_config, "parsing_fn", None)
        ):
            parsing_fn = metric.custom_output_config.parsing_fn
            return parsing_fn(raw_outputs)
        return {constants.MetricResult.RAW_OUTPUT_KEY: raw_outputs}
    else:
        return {
            constants.MetricResult.PAIRWISE_CHOICE_KEY: metric_result_dict.get(
                constants.MetricResult.PAIRWISE_CHOICE_KEY,
            ),
            constants.MetricResult.EXPLANATION_KEY: metric_result_dict.get(
                constants.MetricResult.EXPLANATION_KEY
            ),
        }


def _parse_rubric_based_instruction_following_results(
    metric_result_dict: Dict[str, Any],
) -> Dict[str, Any]:
    """Parses the rubric-based instruction following metric results from the evaluation results.

    Args:
        metric_result_dict: The metric results dictionary.

    Returns:
        A dictionary containing a list of rubrics with corresponding verdicts and
        an overall instruction following score.
    """
    rubric_critique_results = []
    for rc_result in metric_result_dict["rubric_critique_results"]:
        if "verdict" not in rc_result:
            rc_result["verdict"] = False  # proto3 shows a False bool as unset
        rubric_critique_results.append(
            {
                "rubric": rc_result["rubric"],
                "verdict": rc_result["verdict"],
            }
        )
    return {
        constants.MetricResult.RUBRIC_LEVEL_INSTRUCTION_FOLLOWING_KEY: (
            rubric_critique_results
        ),
        constants.MetricResult.SCORE_KEY: (
            metric_result_dict.get(constants.MetricResult.SCORE_KEY)
        ),
    }
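

# --- Illustrative sketch (editor's addition, not part of the original commit).
# It shows the input shape _parse_rubric_based_instruction_following_results()
# expects: a "rubric_critique_results" list of rubric/verdict pairs plus an
# overall score keyed by constants.MetricResult.SCORE_KEY. The rubric text and
# score are hypothetical.
def _example_parse_rubric_results() -> Dict[str, Any]:
    """Parses a sample rubric-based instruction following result dict."""
    sample_result = {
        "rubric_critique_results": [
            {"rubric": "Mentions the requested city.", "verdict": True},
            {"rubric": "Stays under 100 words."},  # missing verdict -> False
        ],
        constants.MetricResult.SCORE_KEY: 0.5,
    }
    return _parse_rubric_based_instruction_following_results(sample_result)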


def handle_response(
    response: Union[str, gapic_eval_service_types.EvaluateInstancesResponse],
    metric: Union[str, metrics_base._Metric],
) -> Union[str, Dict[str, Any]]:
    """Handles the response from the evaluation service.

    Args:
        response: The response from the evaluation service.
        metric: The metric to evaluate, used to check the output type.

    Returns:
        A parsed metric result dictionary, or an error message string.
    """
    if isinstance(response, str):
        return response

    metric_type = response._pb.WhichOneof(  # pylint: disable=protected-access
        "evaluation_results"
    )

    if metric_type == constants.MetricResult.EXACT_MATCH_RESULTS:
        metric_result = response.exact_match_results
    elif metric_type == constants.MetricResult.BLEU_RESULTS:
        metric_result = response.bleu_results
    elif metric_type == constants.MetricResult.ROUGE_RESULTS:
        metric_result = response.rouge_results
    elif metric_type == constants.MetricResult.TOOL_CALL_VALID_RESULTS:
        metric_result = response.tool_call_valid_results
    elif metric_type == constants.MetricResult.TOOL_NAME_MATCH_RESULTS:
        metric_result = response.tool_name_match_results
    elif metric_type == constants.MetricResult.TOOL_PARAMETER_KEY_MATCH_RESULTS:
        metric_result = response.tool_parameter_key_match_results
    elif metric_type == constants.MetricResult.TOOL_PARAMETER_KV_MATCH_RESULTS:
        metric_result = response.tool_parameter_kv_match_results
    elif metric_type == constants.MetricResult.POINTWISE_METRIC_RESULT:
        metric_result = response.pointwise_metric_result
    elif metric_type == constants.MetricResult.PAIRWISE_METRIC_RESULT:
        metric_result = response.pairwise_metric_result
    elif metric_type == constants.MetricResult.TRAJECTORY_EXACT_MATCH_RESULTS:
        metric_result = response.trajectory_exact_match_results
    elif metric_type == constants.MetricResult.TRAJECTORY_IN_ORDER_MATCH_RESULTS:
        metric_result = response.trajectory_in_order_match_results
    elif metric_type == constants.MetricResult.TRAJECTORY_ANY_ORDER_MATCH_RESULTS:
        metric_result = response.trajectory_any_order_match_results
    elif metric_type == constants.MetricResult.TRAJECTORY_PRECISION_RESULTS:
        metric_result = response.trajectory_precision_results
    elif metric_type == constants.MetricResult.TRAJECTORY_RECALL_RESULTS:
        metric_result = response.trajectory_recall_results
    elif metric_type == constants.MetricResult.TRAJECTORY_SINGLE_TOOL_USE_RESULTS:
        metric_result = response.trajectory_single_tool_use_results
    elif (
        metric_type == constants.MetricResult.RUBRIC_BASED_INSTRUCTION_FOLLOWING_RESULT
    ):
        metric_result = response.rubric_based_instruction_following_result
    else:
        raise ValueError(f"Unknown metric type: {metric_type}")

    metric_result_dict = json_format.MessageToDict(
        metric_result._pb,  # pylint: disable=protected-access
        preserving_proto_field_name=True,
    )
    if metric_type in (constants.MetricResult.AUTOMATIC_METRIC_RESULTS_LIST):
        result = _parse_autometric_results(metric_result_dict)
    elif metric_type == constants.MetricResult.POINTWISE_METRIC_RESULT:
        result = _parse_pointwise_results(metric_result_dict, metric)
    elif metric_type == constants.MetricResult.PAIRWISE_METRIC_RESULT:
        result = _parse_pairwise_results(metric_result_dict, metric)
    elif (
        metric_type == constants.MetricResult.RUBRIC_BASED_INSTRUCTION_FOLLOWING_RESULT
    ):
        result = _parse_rubric_based_instruction_following_results(metric_result_dict)
    else:
        raise ValueError(f"Unknown metric type: {metric_type}")
    return result


def evaluate_instances(
    client: gapic_evaluation_services.EvaluationServiceClient,
    request: gapic_eval_service_types.EvaluateInstancesRequest,
    rate_limiter: utils.RateLimiter,
    retry_timeout: float,
) -> gapic_eval_service_types.EvaluateInstancesResponse:
    """Evaluates an instance using Vertex Gen AI Evaluation Service.

    Args:
        client: The Vertex Gen AI evaluation service client for evaluation.
        request: An EvaluateInstancesRequest.
        rate_limiter: The rate limiter for evaluation service requests.
        retry_timeout: How long to keep retrying the evaluation requests, in seconds.

    Returns:
        An EvaluateInstancesResponse from Vertex Gen AI Evaluation Service.
    """
    rate_limiter.sleep_and_advance()
    return client.evaluate_instances(
        request=request,
        retry=api_core.retry.Retry(
            initial=0.250,
            maximum=90.0,
            multiplier=1.45,
            timeout=retry_timeout,
            predicate=api_core.retry.if_exception_type(
                api_core.exceptions.Aborted,
                api_core.exceptions.DeadlineExceeded,
                api_core.exceptions.ResourceExhausted,
                api_core.exceptions.ServiceUnavailable,
                api_core.exceptions.Cancelled,
            ),
        ),
    )
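

# --- Illustrative sketch (editor's addition, not part of the original commit).
# It wires build_request() and evaluate_instances() together for a single row.
# The EvaluationRunConfig value, the column names, and the rate limiter settings
# are hypothetical; utils.RateLimiter's constructor signature is assumed here.
def _example_evaluate_row(
    evaluation_run_config: eval_base.EvaluationRunConfig,
) -> Union[str, Dict[str, Any]]:
    """Runs one ROUGE evaluation request and parses the response."""
    client = gapic_evaluation_services.EvaluationServiceClient()
    request = build_request(
        metric=constants.Metric.ROUGE_L_SUM,
        row_dict={"response": "The cat sat.", "reference": "A cat sat."},
        evaluation_run_config=evaluation_run_config,
    )
    response = evaluate_instances(
        client=client,
        request=request,
        rate_limiter=utils.RateLimiter(rate=1.0),  # assumed constructor signature
        retry_timeout=600.0,
    )
    return handle_response(response, constants.Metric.ROUGE_L_SUM)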
@@ -0,0 +1,79 @@
# -*- coding: utf-8 -*-

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""ROUGE Metric."""

from typing import Literal
from vertexai.preview.evaluation import constants
from vertexai.preview.evaluation.metrics import _base


class Rouge(_base._AutomaticMetric):  # pylint: disable=protected-access
    """The ROUGE Metric.

    Calculates the recall of n-grams in the prediction as compared to the
    reference and returns a score ranging between 0 and 1. Supported rouge types
    are rougen[1-9], rougeL, and rougeLsum.
    """

    _metric_name = constants.Metric.ROUGE

    def __init__(
        self,
        *,
        rouge_type: Literal[
            "rouge1",
            "rouge2",
            "rouge3",
            "rouge4",
            "rouge5",
            "rouge6",
            "rouge7",
            "rouge8",
            "rouge9",
            "rougeL",
            "rougeLsum",
        ],
        use_stemmer: bool = False,
        split_summaries: bool = False,
    ):
        """Initializes the ROUGE metric.

        Args:
            rouge_type: Supported rouge types are rougen[1-9], rougeL, and rougeLsum.
            use_stemmer: Whether to use a stemmer to compute the rouge score.
            split_summaries: Whether to split summaries when using 'rougeLsum' to
                compute the rouge score.
        """
        self._rouge_type = rouge_type
        self._use_stemmer = use_stemmer
        self._split_summaries = split_summaries

        super().__init__(
            metric=Rouge._metric_name,
        )

    @property
    def rouge_type(self) -> str:
        return self._rouge_type

    @property
    def use_stemmer(self) -> bool:
        return self._use_stemmer

    @property
    def split_summaries(self) -> bool:
        return self._split_summaries
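

# --- Illustrative usage sketch (editor's addition, not part of the original
# commit). A ROUGE-L-sum variant with stemming, using only the constructor
# arguments defined above.
def _example_rouge_metric() -> Rouge:
    """Builds a sample ROUGE metric configuration."""
    return Rouge(
        rouge_type="rougeLsum",
        use_stemmer=True,
        split_summaries=True,
    )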
@@ -0,0 +1,148 @@
# -*- coding: utf-8 -*-

# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Schema for autorater metric configuration."""

AUTORATER_METRIC_SCHEMA = """
$schema: https://json-schema.org/draft/2020-12/schema
title: AutoRater Metric Configuration
description: A metric definition for model-based evaluation.
type: object
properties:
  metadata:
    description: Useful information about the metric.
    type: object
    properties:
      name:
        description: Name of the metric.
        type: string
      description:
        description: Description of the metric.
        type: string
      author:
        description: Author of the metric.
        type: string
      contact:
        description: PoC for the metric.
        type: string
      version:
        description: Version of the metric.
        type: string
      classification:
        description: Classification of the metric.
        type: string
        enum:
          - experimental
          - benchmarked
          - deprecated
      required_inputs:
        description: Input fields used in the metric prompt template.
        type: array
        items:
          type: string
        minItems: 1
        uniqueItems: true
      benchmarks:
        description: List of benchmarks used for the metric.
        type: array
        items:
          type: object
          properties:
            dataset:
              description: Dataset used for benchmarking.
              type: string
            results:
              description: Results from benchmarking.
              type: string
          required:
            - results
        minItems: 1
        uniqueItems: true
      usage:
        description: Links to documentation or notebooks with example usage.
        type: array
        items:
          type: string
        minItems: 1
        uniqueItems: true
    required:
      - name
      - version
      - required_inputs
  steps:
    description: List of steps used for the autorater workflow.
    type: array
    items:
      type: object
      properties:
        type:
          description: Type of the step.
          type: string
          enum:
            - pointwise_metric
            - pairwise_metric
            - rubric
        prompt:
          description: Prompt template for the step.
          type: object
          properties:
            system_instruction:
              description: System instruction for the model.
              type: string
            template:
              description: Template to populate with inputs from the dataset.
              type: string
          required:
            - template
        model:
          description: Configuration of the model for the step.
          type: object
          properties:
            model_name_or_endpoint:
              description: Name or endpoint of the model.
              type: string
          required:
            - model_name_or_endpoint
        options:
          description: Options for the step.
          type: object
          properties:
            sample_count:
              description: Number of samples for each instance in the dataset.
              type: integer
            flip_enabled:
              description: Whether to flip candidate and baseline responses.
              type: boolean
        output:
          description: Output of the step.
          type: object
          properties:
            type:
              description: Type of the output.
              type: string
              enum:
                - raw
          required:
            - type
      required:
        - type
        - prompt
    minItems: 1
    uniqueItems: true
required:
  - metadata
  - steps
"""
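

# --- Illustrative sketch (editor's addition, not part of the original commit).
# A minimal configuration that satisfies the schema above (metadata.name,
# metadata.version, metadata.required_inputs, and one step with a prompt
# template). Validating with the third-party `pyyaml` and `jsonschema` packages
# is an assumption, not something this module requires.
def _example_validate_autorater_config() -> None:
    """Validates a sample autorater metric config against the schema."""
    import jsonschema  # assumed dependency
    import yaml  # assumed dependency

    sample_config = {
        "metadata": {
            "name": "text_quality",
            "version": "0.0.1",
            "required_inputs": ["prompt", "response"],
        },
        "steps": [
            {
                "type": "pointwise_metric",
                "prompt": {"template": "Rate the response to {prompt}: {response}"},
            }
        ],
    }
    jsonschema.validate(
        instance=sample_config, schema=yaml.safe_load(AUTORATER_METRIC_SCHEMA)
    )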
@@ -0,0 +1,49 @@
# -*- coding: utf-8 -*-

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from vertexai.preview.evaluation import constants
from vertexai.preview.evaluation.metrics import _base


class TrajectorySingleToolUse(
    _base._AutomaticMetric
):  # pylint: disable=protected-access
    """The TrajectorySingleToolUse Metric.

    Evaluates whether a tool is present in the trajectory or not.
    """

    _metric_name = constants.Metric.TRAJECTORY_SINGLE_TOOL_USE

    def __init__(
        self,
        tool_name: str,
    ):
        """Initializes the TrajectorySingleToolUse metric.

        Args:
            tool_name: The name of the tool to check.
        """
        self._tool_name = tool_name

        super().__init__(
            metric=TrajectorySingleToolUse._metric_name,
        )

    @property
    def tool_name(self) -> str:
        return self._tool_name
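

# --- Illustrative usage sketch (editor's addition, not part of the original
# commit). The tool name below is hypothetical.
def _example_single_tool_use_metric() -> TrajectorySingleToolUse:
    """Builds a metric that checks for one specific tool call."""
    return TrajectorySingleToolUse(tool_name="book_flight")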
@@ -0,0 +1,39 @@
# -*- coding: utf-8 -*-

# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Custom output config for model-based metrics."""

from typing import Any, Callable, Dict, Optional


class CustomOutputConfig:
    """Custom output config for model-based metrics.

    Attributes:
        return_raw_output: Whether to return the raw output of the metric
            function.
        parsing_fn: Function to parse the raw output of the metric.
    """

    def __init__(
        self,
        return_raw_output: bool = False,
        parsing_fn: Optional[Callable[[str], Dict[str, Any]]] = None,
    ):
        """Initializes CustomOutputConfig."""
        self.return_raw_output = return_raw_output
        self.parsing_fn = parsing_fn
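

# --- Illustrative usage sketch (editor's addition, not part of the original
# commit). The parser below is hypothetical; it turns the judge model's raw
# text output into a score dictionary.
def _example_custom_output_config() -> CustomOutputConfig:
    """Builds a config that returns raw judge output and parses it."""

    def parse_raw_output(raw_output: str) -> Dict[str, Any]:
        # e.g. the judge was instructed to answer with a bare number.
        return {"score": float(raw_output.strip())}

    return CustomOutputConfig(return_raw_output=True, parsing_fn=parse_raw_output)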
@@ -0,0 +1,395 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2024 Google LLC
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
"""Metric prompt template classes for model-based metrics evaluation."""
|
||||
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
from google.cloud.aiplatform import base
|
||||
from vertexai.preview.evaluation import (
|
||||
prompt_template,
|
||||
)
|
||||
|
||||
|
||||
_LOGGER = base.Logger(__name__)
|
||||
_NEWLINE = "\n"
|
||||
|
||||
|
||||
def serialize_dict_in_order(elements: Optional[Dict[str, str]]):
|
||||
"""Serializes dictionary to ordered string value without brackets."""
|
||||
if elements is None:
|
||||
return ""
|
||||
return _NEWLINE.join(f"{key}: {value}" for key, value in sorted(elements.items()))
|
||||
|
||||
|
||||
class _MetricPromptTemplate(prompt_template.PromptTemplate):
|
||||
"""Metric prompt template for generic model-based metrics evaluation."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
criteria: Dict[str, str],
|
||||
rating_rubric: Dict[str, str],
|
||||
input_variables: List[str],
|
||||
instruction: Optional[str] = None,
|
||||
evaluation_steps: Optional[Dict[str, str]] = None,
|
||||
metric_definition: Optional[str] = None,
|
||||
few_shot_examples: Optional[List[str]] = None,
|
||||
):
|
||||
"""Initializes a metric prompt template."""
|
||||
self._input_variables = input_variables
|
||||
|
||||
self._instruction = instruction
|
||||
self._metric_definition = metric_definition
|
||||
self._criteria = criteria
|
||||
self._rating_rubric = rating_rubric
|
||||
self._evaluation_steps = evaluation_steps
|
||||
self._few_shot_examples = few_shot_examples
|
||||
|
||||
self.template = self.__str__()
|
||||
|
||||
@property
|
||||
def prompt_data(self) -> str:
|
||||
return self.template
|
||||
|
||||
|
||||
class PointwiseMetricPromptTemplate(_MetricPromptTemplate):
|
||||
"""Pointwise metric prompt template for pointwise model-based metrics."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
criteria: Dict[str, str],
|
||||
rating_rubric: Dict[str, str],
|
||||
input_variables: Optional[List[str]] = None,
|
||||
instruction: Optional[str] = None,
|
||||
metric_definition: Optional[str] = None,
|
||||
evaluation_steps: Optional[Dict[str, str]] = None,
|
||||
few_shot_examples: Optional[List[str]] = None,
|
||||
):
|
||||
"""Initializes a pointwise metric prompt template.
|
||||
|
||||
Args:
|
||||
criteria: The standards and measures used to evaluate the model
|
||||
responses. It is a dictionary of criterion names and criterion
|
||||
definitions.
|
||||
rating_rubric: A dictionary mapping of rating name and rating
|
||||
definition, used to assign ratings or scores based on specific
|
||||
criteria.
|
||||
input_variables: An optional list of input fields to use in the metric
|
||||
prompt template for generating model-based evaluation results. Model
|
||||
"response" column is included by default. If metric_column_mapping is
|
||||
provided, the mapping values of the input fields will be used to
|
||||
retrieve data from the evaluation dataset.
|
||||
instruction: The general instruction to the model that performs the
|
||||
evaluation. If not provided, a default pointwise metric instruction
|
||||
will be used.
|
||||
metric_definition: The optional metric definition. It is a string
|
||||
describing the metric to be evaluated at a high level. If not
|
||||
provided, this field will not be included in the prompt template.
|
||||
evaluation_steps: The optional gudelines of evaluation steps. A
|
||||
dictionary of evaluation step name and evaluation step definition. If
|
||||
not provided, a default pointwise metric evaluation steps will be
|
||||
used.
|
||||
few_shot_examples: The optional list of few-shot examples to be used in
|
||||
the prompt, to provide the model with demonstrations of how to perform
|
||||
the evaluation, and improve the evaluation accuracy. If not provided,
|
||||
this field will not be included in the prompt template.
|
||||
"""
|
||||
if not input_variables:
|
||||
input_variables = []
|
||||
_LOGGER.info(
|
||||
"The `input_variables` parameter is empty. Only the `response`"
|
||||
" column is used for computing this model-based metric."
|
||||
)
|
||||
input_variables = list(set(input_variables + ["response"]))
|
||||
|
||||
instruction = instruction or self.get_default_pointwise_instruction()
|
||||
|
||||
evaluation_steps = (
|
||||
evaluation_steps or self.get_default_pointwise_evaluation_steps()
|
||||
)
|
||||
|
||||
super().__init__(
|
||||
input_variables=input_variables,
|
||||
criteria=criteria,
|
||||
rating_rubric=rating_rubric,
|
||||
instruction=instruction,
|
||||
metric_definition=metric_definition,
|
||||
evaluation_steps=evaluation_steps,
|
||||
few_shot_examples=few_shot_examples,
|
||||
)
|
||||
|
||||
def get_default_pointwise_instruction(self) -> str:
|
||||
"""Returns the default instruction for the metric prompt template."""
|
||||
|
||||
return (
|
||||
"You are an expert evaluator. Your task is to evaluate the quality of"
|
||||
" the responses generated by AI models. We will provide you with the"
|
||||
" user prompt and an AI-generated responses.\nYou should first read"
|
||||
" the user input carefully for analyzing the task, and then evaluate"
|
||||
" the quality of the responses based on the Criteria provided in the"
|
||||
" Evaluation section below.\nYou will assign the response a rating"
|
||||
" following the Rating Rubric and Evaluation Steps. Give step by step"
|
||||
" explanations for your rating, and only choose ratings from the Rating"
|
||||
" Rubric."
|
||||
)
|
||||
|
||||
def get_default_pointwise_evaluation_steps(self) -> Dict[str, str]:
|
||||
"""Returns the default evaluation steps for the metric prompt template."""
|
||||
return {
|
||||
"Step 1": (
|
||||
"Assess the response in aspects of all criteria provided. Provide"
|
||||
" assessment according to each criterion."
|
||||
),
|
||||
"Step 2": (
|
||||
"Score based on the rating rubric. Give a brief rationale to"
|
||||
" explain your evaluation considering each individual criterion."
|
||||
),
|
||||
}
|
||||
|
||||
def __str__(self):
|
||||
"""Serializes the pointwise metric prompt template to a string."""
|
||||
metric_prompt_template_str = [
|
||||
"# Instruction",
|
||||
f"{self._instruction}",
|
||||
_NEWLINE,
|
||||
"# Evaluation",
|
||||
]
|
||||
if self._metric_definition:
|
||||
metric_prompt_template_str.extend(
|
||||
[
|
||||
"## Metric Definition",
|
||||
f"{self._metric_definition}\n",
|
||||
]
|
||||
)
|
||||
metric_prompt_template_str.extend(
|
||||
[
|
||||
"## Criteria",
|
||||
f"{serialize_dict_in_order(self._criteria)}\n",
|
||||
"## Rating Rubric",
|
||||
f"{serialize_dict_in_order(self._rating_rubric)}\n",
|
||||
]
|
||||
)
|
||||
if self._evaluation_steps:
|
||||
metric_prompt_template_str.extend(
|
||||
[
|
||||
"## Evaluation Steps",
|
||||
f"{serialize_dict_in_order(self._evaluation_steps)}\n",
|
||||
]
|
||||
)
|
||||
if self._few_shot_examples:
|
||||
metric_prompt_template_str.extend(
|
||||
[
|
||||
"## Evaluation Examples",
|
||||
f"{_NEWLINE.join(self._few_shot_examples)}\n",
|
||||
]
|
||||
)
|
||||
metric_prompt_template_str.extend(
|
||||
["\n# User Inputs and AI-generated Response", "## User Inputs"]
|
||||
)
|
||||
for input_variable in self._input_variables:
|
||||
if input_variable == "response":
|
||||
continue
|
||||
metric_prompt_template_str.extend(
|
||||
[
|
||||
f"### {input_variable}",
|
||||
f"{{{input_variable}}}\n",
|
||||
]
|
||||
)
|
||||
metric_prompt_template_str.extend(
|
||||
[
|
||||
_NEWLINE,
|
||||
"\n## AI-generated Response",
|
||||
"{response}",
|
||||
]
|
||||
)
|
||||
return _NEWLINE.join(metric_prompt_template_str)
|
||||
|
||||
def __repr__(self):
|
||||
return (
|
||||
f"PointwiseMetricPromptTemplate(prompt_data={self.prompt_data},"
|
||||
f" variables={self.variables})"
|
||||
)
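
# A minimal usage sketch (illustrative only; the criteria, rubric text, and
# input variable below are placeholders, not shipped defaults): build a
# pointwise template and serialize it into the judge-model prompt with str().
def _example_pointwise_prompt_template() -> str:
    template = PointwiseMetricPromptTemplate(
        criteria={
            "accuracy": "The response contains no factual errors.",
        },
        rating_rubric={
            "1": "The response meets the accuracy criterion.",
            "0": "The response does not meet the accuracy criterion.",
        },
        input_variables=["prompt"],
    )
    # The rendered prompt contains the Criteria, Rating Rubric, the default
    # Evaluation Steps, and a {prompt}/{response} input section.
    return str(template)
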
|
||||
|
||||
|
||||
class PairwiseMetricPromptTemplate(_MetricPromptTemplate):
|
||||
"""Pairwise metric prompt template for pairwise model-based metrics."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
criteria: Dict[str, str],
|
||||
rating_rubric: Dict[str, str],
|
||||
input_variables: Optional[List[str]] = None,
|
||||
instruction: Optional[str] = None,
|
||||
metric_definition: Optional[str] = None,
|
||||
evaluation_steps: Optional[Dict[str, str]] = None,
|
||||
few_shot_examples: Optional[List[str]] = None,
|
||||
):
|
||||
"""Initializes a pairwise metric prompt template.
|
||||
|
||||
Args:
|
||||
criteria: The standards and measures used to evaluate the model
|
||||
responses. It is a dictionary of criterion names and criterion
|
||||
definitions.
|
||||
rating_rubric: A dictionary mapping rating names to rating
|
||||
definitions, used to assign ratings or scores based on specific
|
||||
criteria.
|
||||
input_variables: An optional list of input fields to use in the metric
|
||||
prompt template for generating model-based evaluation results.
|
||||
Candidate model "response" column and "baseline_model_response" column
|
||||
are included by default. If metric_column_mapping is provided, the
|
||||
mapping values of the input fields will be used to retrieve data from
|
||||
the evaluation dataset.
|
||||
instruction: The general instruction to the model that performs the
|
||||
evaluation. If not provided, a default pairwise metric instruction
|
||||
will be used.
|
||||
metric_definition: The optional metric definition. It is a string
|
||||
describing the metric to be evaluated at a high level. If not
|
||||
provided, this field will not be included in the prompt template.
|
||||
evaluation_steps: The optional guidelines for evaluation steps. A
|
||||
dictionary of evaluation step names and evaluation step definitions. If
|
||||
not provided, the default pairwise metric evaluation steps will be used.
|
||||
few_shot_examples: The optional list of few-shot examples to be used in
|
||||
the prompt, to provide the model with demonstrations of how to perform
|
||||
the evaluation, and improve the evaluation accuracy. If not provided,
|
||||
this field will not be included in the prompt template.
|
||||
"""
|
||||
if not input_variables:
|
||||
input_variables = []
|
||||
_LOGGER.info(
|
||||
"The `input_variables` parameter is empty. Only the `response`"
|
||||
" column and `baseline_model_response` columns are used for"
|
||||
" computing this model-based metric."
|
||||
)
|
||||
input_variables = list(
|
||||
set(input_variables + ["response", "baseline_model_response"])
|
||||
)
|
||||
|
||||
instruction = instruction or self.get_default_pairwise_instruction()
|
||||
|
||||
evaluation_steps = (
|
||||
evaluation_steps or self.get_default_pairwise_evaluation_steps()
|
||||
)
|
||||
|
||||
super().__init__(
|
||||
input_variables=input_variables,
|
||||
criteria=criteria,
|
||||
rating_rubric=rating_rubric,
|
||||
instruction=instruction,
|
||||
metric_definition=metric_definition,
|
||||
evaluation_steps=evaluation_steps,
|
||||
few_shot_examples=few_shot_examples,
|
||||
)
|
||||
|
||||
def get_default_pairwise_instruction(self) -> str:
|
||||
"""Returns the default instruction for the metric prompt template."""
|
||||
|
||||
return (
|
||||
"You are an expert evaluator. Your task is to evaluate the quality of"
|
||||
" the responses generated by two AI models. We will provide you with"
|
||||
" the user input and a pair of AI-generated responses (Response A and"
|
||||
" Response B).\nYou should first read the user input carefully for"
|
||||
" analyzing the task, and then evaluate the quality of the responses"
|
||||
" based on based on the Criteria provided in the Evaluation section"
|
||||
" below.\nYou will first judge responses individually, following the"
|
||||
" Rating Rubric and Evaluation Steps. Then you will give step by step"
|
||||
" explanations for your judgement, compare results to declare the"
|
||||
" winner based on the Rating Rubric and Evaluation Steps."
|
||||
)
|
||||
|
||||
def get_default_pairwise_evaluation_steps(self) -> Dict[str, str]:
|
||||
"""Returns the default evaluation steps for the metric prompt template."""
|
||||
return {
|
||||
"Step 1": "Analyze Response A based on all the Criteria.",
|
||||
"Step 2": "Analyze Response B based on all the Criteria.",
|
||||
"Step 3": (
|
||||
"Compare the overall performance of Response A and Response B based"
|
||||
" on your analyses and assessment."
|
||||
),
|
||||
"Step 4": (
|
||||
'Output your preference of "A", "SAME" or "B" to the'
|
||||
" pairwise_choice field according to the Rating Rubrics."
|
||||
),
|
||||
"Step 5": "Output your assessment reasoning in the explanation field",
|
||||
}
|
||||
|
||||
def __str__(self):
|
||||
"""Serializes the pairwise metric prompt template to a string."""
|
||||
metric_prompt_template_str = [
|
||||
"# Instruction",
|
||||
f"{self._instruction}",
|
||||
_NEWLINE,
|
||||
"# Evaluation",
|
||||
]
|
||||
if self._metric_definition:
|
||||
metric_prompt_template_str.extend(
|
||||
[
|
||||
"## Metric Definition",
|
||||
f"{self._metric_definition}\n",
|
||||
]
|
||||
)
|
||||
metric_prompt_template_str.extend(
|
||||
[
|
||||
"## Criteria",
|
||||
f"{serialize_dict_in_order(self._criteria)}\n",
|
||||
"## Rating Rubric",
|
||||
f"{serialize_dict_in_order(self._rating_rubric)}\n",
|
||||
]
|
||||
)
|
||||
if self._evaluation_steps:
|
||||
metric_prompt_template_str.extend(
|
||||
[
|
||||
"## Evaluation Steps",
|
||||
f"{serialize_dict_in_order(self._evaluation_steps)}\n",
|
||||
]
|
||||
)
|
||||
if self._few_shot_examples:
|
||||
metric_prompt_template_str.extend(
|
||||
[
|
||||
"## Evaluation Examples",
|
||||
f"{_NEWLINE.join(self._few_shot_examples)}\n",
|
||||
]
|
||||
)
|
||||
metric_prompt_template_str.extend(
|
||||
["\n# User Inputs and AI-generated Responses", "## User Inputs"]
|
||||
)
|
||||
for input_variable in self._input_variables:
|
||||
if input_variable in ["response", "baseline_model_response"]:
|
||||
continue
|
||||
metric_prompt_template_str.extend(
|
||||
[
|
||||
f"### {input_variable}",
|
||||
f"{{{input_variable}}}\n",
|
||||
]
|
||||
)
|
||||
metric_prompt_template_str.extend(
|
||||
[
|
||||
"\n## AI-generated Responses",
|
||||
"### Response A",
|
||||
"{baseline_model_response}\n",
|
||||
"### Response B",
|
||||
"{response}",
|
||||
]
|
||||
)
|
||||
return _NEWLINE.join(metric_prompt_template_str)
|
||||
|
||||
def __repr__(self):
|
||||
return (
|
||||
f"PairwiseMetricPromptTemplate(prompt_data={self.prompt_data},"
|
||||
f" variables={self.variables})"
|
||||
)
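
# A minimal usage sketch for the pairwise template (illustrative placeholder
# criteria and rubric text): the serialized prompt presents the baseline
# output as Response A and the candidate output as Response B, matching the
# __str__ rendering above.
def _example_pairwise_prompt_template() -> str:
    template = PairwiseMetricPromptTemplate(
        criteria={
            "helpfulness": "The response addresses the user's request.",
        },
        rating_rubric={
            "A": "Response A is better.",
            "SAME": "Both responses are equally good.",
            "B": "Response B is better.",
        },
        input_variables=["prompt"],
    )
    return str(template)
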
|
||||
@@ -0,0 +1,197 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2024 Google LLC
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
"""Example metric prompt templates for model-based evaluation."""
|
||||
|
||||
from typing import List
|
||||
|
||||
from vertexai.preview.evaluation import constants
|
||||
from vertexai.preview.evaluation.metrics import (
|
||||
_default_templates,
|
||||
)
|
||||
from vertexai.preview.evaluation.metrics import pairwise_metric
|
||||
from vertexai.preview.evaluation.metrics import pointwise_metric
|
||||
|
||||
|
||||
class MetricPromptTemplateExamples:
|
||||
"""Examples of metric prompt templates for model-based evaluation."""
|
||||
|
||||
_PROMPT_TEMPLATE_MAP = {
|
||||
constants.Metric.COHERENCE: _default_templates.COHERENCE_PROMPT_TEMPLATE,
|
||||
constants.Metric.FLUENCY: _default_templates.FLUENCY_PROMPT_TEMPLATE,
|
||||
constants.Metric.SAFETY: _default_templates.SAFETY_PROMPT_TEMPLATE,
|
||||
constants.Metric.GROUNDEDNESS: (
|
||||
_default_templates.GROUNDEDNESS_PROMPT_TEMPLATE
|
||||
),
|
||||
constants.Metric.INSTRUCTION_FOLLOWING: (
|
||||
_default_templates.INSTRUCTION_FOLLOWING_PROMPT_TEMPLATE
|
||||
),
|
||||
constants.Metric.VERBOSITY: _default_templates.VERBOSITY_PROMPT_TEMPLATE,
|
||||
constants.Metric.TEXT_QUALITY: (
|
||||
_default_templates.TEXT_QUALITY_PROMPT_TEMPLATE
|
||||
),
|
||||
constants.Metric.SUMMARIZATION_QUALITY: (
|
||||
_default_templates.SUMMARIZATION_QUALITY_PROMPT_TEMPLATE
|
||||
),
|
||||
constants.Metric.QUESTION_ANSWERING_QUALITY: (
|
||||
_default_templates.QUESTION_ANSWERING_QUALITY_PROMPT_TEMPLATE
|
||||
),
|
||||
constants.Metric.MULTI_TURN_CHAT_QUALITY: (
|
||||
_default_templates.MULTI_TURN_CHAT_QUALITY_PROMPT_TEMPLATE
|
||||
),
|
||||
constants.Metric.MULTI_TURN_SAFETY: (
|
||||
_default_templates.MULTI_TURN_SAFETY_PROMPT_TEMPLATE
|
||||
),
|
||||
constants.Metric.PAIRWISE_COHERENCE: (
|
||||
_default_templates.PAIRWISE_COHERENCE_PROMPT_TEMPLATE
|
||||
),
|
||||
constants.Metric.PAIRWISE_FLUENCY: (
|
||||
_default_templates.PAIRWISE_FLUENCY_PROMPT_TEMPLATE
|
||||
),
|
||||
constants.Metric.PAIRWISE_SAFETY: (
|
||||
_default_templates.PAIRWISE_SAFETY_PROMPT_TEMPLATE
|
||||
),
|
||||
constants.Metric.PAIRWISE_GROUNDEDNESS: (
|
||||
_default_templates.PAIRWISE_GROUNDEDNESS_PROMPT_TEMPLATE
|
||||
),
|
||||
constants.Metric.PAIRWISE_INSTRUCTION_FOLLOWING: (
|
||||
_default_templates.PAIRWISE_INSTRUCTION_FOLLOWING_PROMPT_TEMPLATE
|
||||
),
|
||||
constants.Metric.PAIRWISE_VERBOSITY: (
|
||||
_default_templates.PAIRWISE_VERBOSITY_PROMPT_TEMPLATE
|
||||
),
|
||||
constants.Metric.PAIRWISE_TEXT_QUALITY: (
|
||||
_default_templates.PAIRWISE_TEXT_QUALITY_PROMPT_TEMPLATE
|
||||
),
|
||||
constants.Metric.PAIRWISE_SUMMARIZATION_QUALITY: (
|
||||
_default_templates.PAIRWISE_SUMMARIZATION_QUALITY_PROMPT_TEMPLATE
|
||||
),
|
||||
constants.Metric.PAIRWISE_QUESTION_ANSWERING_QUALITY: (
|
||||
_default_templates.PAIRWISE_QUESTION_ANSWERING_QUALITY_PROMPT_TEMPLATE
|
||||
),
|
||||
constants.Metric.PAIRWISE_MULTI_TURN_CHAT_QUALITY: (
|
||||
_default_templates.PAIRWISE_MULTI_TURN_CHAT_QUALITY_PROMPT_TEMPLATE
|
||||
),
|
||||
constants.Metric.PAIRWISE_MULTI_TURN_SAFETY: (
|
||||
_default_templates.PAIRWISE_MULTI_TURN_SAFETY_PROMPT_TEMPLATE
|
||||
),
|
||||
}
|
||||
|
||||
@classmethod
|
||||
def get_prompt_template(cls, metric_name: str) -> str:
|
||||
"""Returns the prompt template for the given metric name."""
|
||||
return cls._PROMPT_TEMPLATE_MAP[metric_name]
|
||||
|
||||
@classmethod
|
||||
def list_example_metric_names(cls) -> List[str]:
|
||||
"""Returns a list of all metric prompt templates."""
|
||||
return list(cls._PROMPT_TEMPLATE_MAP.keys())
|
||||
|
||||
class Pointwise:
|
||||
"""Example PointwiseMetric instances."""
|
||||
|
||||
FLUENCY = pointwise_metric.PointwiseMetric(
|
||||
metric=constants.Metric.FLUENCY,
|
||||
metric_prompt_template=_default_templates.FLUENCY_PROMPT_TEMPLATE,
|
||||
)
|
||||
COHERENCE = pointwise_metric.PointwiseMetric(
|
||||
metric=constants.Metric.COHERENCE,
|
||||
metric_prompt_template=_default_templates.COHERENCE_PROMPT_TEMPLATE,
|
||||
)
|
||||
SAFETY = pointwise_metric.PointwiseMetric(
|
||||
metric=constants.Metric.SAFETY,
|
||||
metric_prompt_template=_default_templates.SAFETY_PROMPT_TEMPLATE,
|
||||
)
|
||||
GROUNDEDNESS = pointwise_metric.PointwiseMetric(
|
||||
metric=constants.Metric.GROUNDEDNESS,
|
||||
metric_prompt_template=_default_templates.GROUNDEDNESS_PROMPT_TEMPLATE,
|
||||
)
|
||||
INSTRUCTION_FOLLOWING = pointwise_metric.PointwiseMetric(
|
||||
metric=constants.Metric.INSTRUCTION_FOLLOWING,
|
||||
metric_prompt_template=_default_templates.INSTRUCTION_FOLLOWING_PROMPT_TEMPLATE,
|
||||
)
|
||||
VERBOSITY = pointwise_metric.PointwiseMetric(
|
||||
metric=constants.Metric.VERBOSITY,
|
||||
metric_prompt_template=_default_templates.VERBOSITY_PROMPT_TEMPLATE,
|
||||
)
|
||||
TEXT_QUALITY = pointwise_metric.PointwiseMetric(
|
||||
metric=constants.Metric.TEXT_QUALITY,
|
||||
metric_prompt_template=_default_templates.TEXT_QUALITY_PROMPT_TEMPLATE,
|
||||
)
|
||||
SUMMARIZATION_QUALITY = pointwise_metric.PointwiseMetric(
|
||||
metric=constants.Metric.SUMMARIZATION_QUALITY,
|
||||
metric_prompt_template=_default_templates.SUMMARIZATION_QUALITY_PROMPT_TEMPLATE,
|
||||
)
|
||||
QUESTION_ANSWERING_QUALITY = pointwise_metric.PointwiseMetric(
|
||||
metric=constants.Metric.QUESTION_ANSWERING_QUALITY,
|
||||
metric_prompt_template=_default_templates.QUESTION_ANSWERING_QUALITY_PROMPT_TEMPLATE,
|
||||
)
|
||||
MULTI_TURN_CHAT_QUALITY = pointwise_metric.PointwiseMetric(
|
||||
metric=constants.Metric.MULTI_TURN_CHAT_QUALITY,
|
||||
metric_prompt_template=_default_templates.MULTI_TURN_CHAT_QUALITY_PROMPT_TEMPLATE,
|
||||
)
|
||||
MULTI_TURN_SAFETY_QUALITY = pointwise_metric.PointwiseMetric(
|
||||
metric=constants.Metric.MULTI_TURN_SAFETY,
|
||||
metric_prompt_template=_default_templates.MULTI_TURN_SAFETY_PROMPT_TEMPLATE,
|
||||
)
|
||||
|
||||
class Pairwise:
|
||||
"""Example PairwiseMetric instances."""
|
||||
|
||||
FLUENCY = pairwise_metric.PairwiseMetric(
|
||||
metric=constants.Metric.PAIRWISE_FLUENCY,
|
||||
metric_prompt_template=_default_templates.PAIRWISE_FLUENCY_PROMPT_TEMPLATE,
|
||||
)
|
||||
COHERENCE = pairwise_metric.PairwiseMetric(
|
||||
metric=constants.Metric.PAIRWISE_COHERENCE,
|
||||
metric_prompt_template=_default_templates.PAIRWISE_COHERENCE_PROMPT_TEMPLATE,
|
||||
)
|
||||
SAFETY = pairwise_metric.PairwiseMetric(
|
||||
metric=constants.Metric.PAIRWISE_SAFETY,
|
||||
metric_prompt_template=_default_templates.PAIRWISE_SAFETY_PROMPT_TEMPLATE,
|
||||
)
|
||||
GROUNDEDNESS = pairwise_metric.PairwiseMetric(
|
||||
metric=constants.Metric.PAIRWISE_GROUNDEDNESS,
|
||||
metric_prompt_template=_default_templates.PAIRWISE_GROUNDEDNESS_PROMPT_TEMPLATE,
|
||||
)
|
||||
INSTRUCTION_FOLLOWING = pairwise_metric.PairwiseMetric(
|
||||
metric=constants.Metric.PAIRWISE_INSTRUCTION_FOLLOWING,
|
||||
metric_prompt_template=_default_templates.PAIRWISE_INSTRUCTION_FOLLOWING_PROMPT_TEMPLATE,
|
||||
)
|
||||
VERBOSITY = pairwise_metric.PairwiseMetric(
|
||||
metric=constants.Metric.PAIRWISE_VERBOSITY,
|
||||
metric_prompt_template=_default_templates.PAIRWISE_VERBOSITY_PROMPT_TEMPLATE,
|
||||
)
|
||||
TEXT_QUALITY = pairwise_metric.PairwiseMetric(
|
||||
metric=constants.Metric.PAIRWISE_TEXT_QUALITY,
|
||||
metric_prompt_template=_default_templates.PAIRWISE_TEXT_QUALITY_PROMPT_TEMPLATE,
|
||||
)
|
||||
SUMMARIZATION_QUALITY = pairwise_metric.PairwiseMetric(
|
||||
metric=constants.Metric.PAIRWISE_SUMMARIZATION_QUALITY,
|
||||
metric_prompt_template=_default_templates.PAIRWISE_SUMMARIZATION_QUALITY_PROMPT_TEMPLATE,
|
||||
)
|
||||
QUESTION_ANSWERING_QUALITY = pairwise_metric.PairwiseMetric(
|
||||
metric=constants.Metric.PAIRWISE_QUESTION_ANSWERING_QUALITY,
|
||||
metric_prompt_template=_default_templates.PAIRWISE_QUESTION_ANSWERING_QUALITY_PROMPT_TEMPLATE,
|
||||
)
|
||||
MULTI_TURN_CHAT_QUALITY = pairwise_metric.PairwiseMetric(
|
||||
metric=constants.Metric.PAIRWISE_MULTI_TURN_CHAT_QUALITY,
|
||||
metric_prompt_template=_default_templates.PAIRWISE_MULTI_TURN_CHAT_QUALITY_PROMPT_TEMPLATE,
|
||||
)
|
||||
MULTI_TURN_SAFETY_QUALITY = pairwise_metric.PairwiseMetric(
|
||||
metric=constants.Metric.PAIRWISE_MULTI_TURN_SAFETY,
|
||||
metric_prompt_template=_default_templates.PAIRWISE_MULTI_TURN_SAFETY_PROMPT_TEMPLATE,
|
||||
)
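
# Illustrative lookups (a sketch, assuming this module imports cleanly in the
# caller's environment): example metrics can be used directly via the nested
# Pointwise/Pairwise classes, or their prompt template text can be fetched by
# metric name and customized before wrapping it in a new metric.
def _example_template_lookup() -> str:
    # List the metric names that ship with example templates.
    names = MetricPromptTemplateExamples.list_example_metric_names()
    assert constants.Metric.FLUENCY in names
    # Fetch the raw prompt template text for one of them.
    return MetricPromptTemplateExamples.get_prompt_template(
        constants.Metric.FLUENCY
    )
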
|
||||
@@ -0,0 +1,133 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2024 Google LLC
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
"""Model-based Pairwise Metric."""
|
||||
|
||||
from typing import Callable, Optional, Union
|
||||
|
||||
from google.cloud.aiplatform_v1beta1.types import (
|
||||
evaluation_service as gapic_eval_service_types,
|
||||
)
|
||||
from vertexai.preview import generative_models
|
||||
from vertexai.preview.evaluation.metrics import _base
|
||||
from vertexai.preview.evaluation.metrics import (
|
||||
custom_output_config as custom_output_config_class,
|
||||
)
|
||||
from vertexai.preview.evaluation.metrics import (
|
||||
metric_prompt_template as metric_prompt_template_base,
|
||||
)
|
||||
|
||||
|
||||
class PairwiseMetric(_base._ModelBasedMetric): # pylint: disable=protected-access
|
||||
"""A Model-based Pairwise Metric.
|
||||
|
||||
A model-based evaluation metric that compares two generative models' responses
|
||||
side-by-side, and allows users to A/B test their generative models to
|
||||
determine which model is performing better.
|
||||
|
||||
For more details on when to use pairwise metrics, see
|
||||
[Evaluation methods and
|
||||
metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#pointwise_versus_pairwise).
|
||||
|
||||
Result Details:
|
||||
|
||||
* In `EvalResult.summary_metrics`, win rates for both the baseline and
|
||||
candidate model are computed. The win rate is the proportion of
|
||||
wins of one model's responses to total attempts, expressed as a decimal value
|
||||
between 0 and 1.
|
||||
|
||||
* In `EvalResult.metrics_table`, a pairwise metric produces two
|
||||
evaluation results per dataset row:
|
||||
* `pairwise_choice`: The choice shows whether the candidate model or
|
||||
the baseline model performs better, or if they are equally good.
|
||||
* `explanation`: The rationale behind each verdict using
|
||||
chain-of-thought reasoning. The explanation helps users scrutinize
|
||||
the judgment and builds appropriate trust in the decisions.
|
||||
|
||||
See [documentation
|
||||
page](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#understand-results)
|
||||
for more details on understanding the metric results.
|
||||
|
||||
Usage Examples:
|
||||
|
||||
```
|
||||
baseline_model = GenerativeModel("gemini-1.0-pro")
|
||||
candidate_model = GenerativeModel("gemini-1.5-pro")
|
||||
|
||||
pairwise_groundedness = PairwiseMetric(
|
||||
metric_prompt_template=MetricPromptTemplateExamples.get_prompt_template(
|
||||
"pairwise_groundedness"
|
||||
),
|
||||
baseline_model=baseline_model,
|
||||
)
|
||||
eval_dataset = pd.DataFrame({
|
||||
"prompt" : [...],
|
||||
})
|
||||
pairwise_task = EvalTask(
|
||||
dataset=eval_dataset,
|
||||
metrics=[pairwise_groundedness],
|
||||
experiment="my-pairwise-experiment",
|
||||
)
|
||||
pairwise_result = pairwise_task.evaluate(
|
||||
model=candidate_model,
|
||||
experiment_run_name="gemini-pairwise-eval-run",
|
||||
)
|
||||
```
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
metric: str,
|
||||
metric_prompt_template: Union[
|
||||
metric_prompt_template_base.PairwiseMetricPromptTemplate, str
|
||||
],
|
||||
baseline_model: Optional[
|
||||
Union[generative_models.GenerativeModel, Callable[[str], str]]
|
||||
] = None,
|
||||
system_instruction: Optional[str] = None,
|
||||
autorater_config: Optional[gapic_eval_service_types.AutoraterConfig] = None,
|
||||
custom_output_config: Optional[
|
||||
custom_output_config_class.CustomOutputConfig
|
||||
] = None,
|
||||
):
|
||||
"""Initializes a pairwise evaluation metric.
|
||||
|
||||
Args:
|
||||
metric: The pairwise evaluation metric name.
|
||||
metric_prompt_template: Pairwise metric prompt template for performing
|
||||
the pairwise model-based evaluation. A freeform string is also accepted.
|
||||
baseline_model: The baseline model for side-by-side comparison. If not
|
||||
specified, the `baseline_model_response` column is required in the dataset
|
||||
to perform bring-your-own-response (BYOR) evaluation.
|
||||
system_instruction: The system instruction for the evaluation.
|
||||
autorater_config: The config for the judge model.
|
||||
custom_output_config: Config for custom output from the judge model.
|
||||
"""
|
||||
super().__init__(
|
||||
metric_prompt_template=metric_prompt_template,
|
||||
metric=metric,
|
||||
system_instruction=system_instruction,
|
||||
autorater_config=autorater_config,
|
||||
custom_output_config=custom_output_config,
|
||||
)
|
||||
self._baseline_model = baseline_model
|
||||
|
||||
@property
|
||||
def baseline_model(
|
||||
self,
|
||||
) -> Union[generative_models.GenerativeModel, Callable[[str], str]]:
|
||||
return self._baseline_model
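
# A minimal construction sketch (illustrative only; the metric name and
# criteria are placeholders): a bring-your-own-response (BYOR) pairwise metric
# with no baseline model, so the evaluation dataset is expected to supply a
# `baseline_model_response` column.
def _example_byor_pairwise_metric() -> PairwiseMetric:
    prompt_template = metric_prompt_template_base.PairwiseMetricPromptTemplate(
        criteria={"clarity": "The response is easy to follow."},
        rating_rubric={
            "A": "Response A is better.",
            "SAME": "Both responses are equally good.",
            "B": "Response B is better.",
        },
    )
    return PairwiseMetric(
        metric="example_pairwise_clarity",
        metric_prompt_template=prompt_template,
    )
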
|
||||
@@ -0,0 +1,95 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2024 Google LLC
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
"""Model-based Pointwise Metric."""
|
||||
|
||||
from typing import Optional, Union
|
||||
|
||||
from google.cloud.aiplatform_v1beta1.types import (
|
||||
evaluation_service as gapic_eval_service_types,
|
||||
)
|
||||
from vertexai.preview.evaluation.metrics import _base
|
||||
from vertexai.preview.evaluation.metrics import (
|
||||
custom_output_config as custom_output_config_class,
|
||||
)
|
||||
from vertexai.preview.evaluation.metrics import (
|
||||
metric_prompt_template as metric_prompt_template_base,
|
||||
)
|
||||
|
||||
|
||||
class PointwiseMetric(_base._ModelBasedMetric): # pylint: disable=protected-access
|
||||
"""A Model-based Pointwise Metric.
|
||||
|
||||
A model-based evaluation metric that evaluates a single generative model's
|
||||
response.
|
||||
|
||||
For more details on when to use model-based pointwise metrics, see
|
||||
[Evaluation methods and metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval).
|
||||
|
||||
Usage Examples:
|
||||
|
||||
```
|
||||
candidate_model = GenerativeModel("gemini-1.5-pro")
|
||||
eval_dataset = pd.DataFrame({
|
||||
"prompt" : [...],
|
||||
})
|
||||
fluency_metric = PointwiseMetric(
|
||||
metric="fluency",
|
||||
metric_prompt_template=MetricPromptTemplateExamples.get_prompt_template('fluency'),
|
||||
)
|
||||
pointwise_eval_task = EvalTask(
|
||||
dataset=eval_dataset,
|
||||
metrics=[
|
||||
fluency_metric,
|
||||
MetricPromptTemplateExamples.Pointwise.GROUNDEDNESS,
|
||||
],
|
||||
)
|
||||
pointwise_result = pointwise_eval_task.evaluate(
|
||||
model=candidate_model,
|
||||
)
|
||||
```
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
metric: str,
|
||||
metric_prompt_template: Union[
|
||||
metric_prompt_template_base.PointwiseMetricPromptTemplate, str
|
||||
],
|
||||
system_instruction: Optional[str] = None,
|
||||
autorater_config: Optional[gapic_eval_service_types.AutoraterConfig] = None,
|
||||
custom_output_config: Optional[
|
||||
custom_output_config_class.CustomOutputConfig
|
||||
] = None,
|
||||
):
|
||||
"""Initializes a pointwise evaluation metric.
|
||||
|
||||
Args:
|
||||
metric: The pointwise evaluation metric name.
|
||||
metric_prompt_template: Pointwise metric prompt template for performing
|
||||
the model-based evaluation. A freeform string is also accepted.
|
||||
system_instruction: The system instruction for the evaluation.
|
||||
autorater_config: The config for the judge model.
|
||||
custom_output_config: Config for custom output from the judge model.
|
||||
"""
|
||||
super().__init__(
|
||||
metric_prompt_template=metric_prompt_template,
|
||||
metric=metric,
|
||||
system_instruction=system_instruction,
|
||||
autorater_config=autorater_config,
|
||||
custom_output_config=custom_output_config,
|
||||
)
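
# A minimal construction sketch (illustrative only; the metric name, criteria,
# and rubric text are placeholders): a pointwise metric built from a
# PointwiseMetricPromptTemplate. A freeform template string would work equally
# well for metric_prompt_template.
def _example_pointwise_metric() -> PointwiseMetric:
    prompt_template = metric_prompt_template_base.PointwiseMetricPromptTemplate(
        criteria={"fluency": "The response is grammatical and natural."},
        rating_rubric={
            "1": "The response is fluent.",
            "0": "The response is not fluent.",
        },
    )
    return PointwiseMetric(
        metric="example_fluency",
        metric_prompt_template=prompt_template,
    )
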
|
||||
@@ -0,0 +1,126 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2025 Google LLC
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from google.cloud.aiplatform_v1beta1.types import (
|
||||
evaluation_service as gapic_eval_service_types,
|
||||
)
|
||||
from vertexai.preview.evaluation import utils
|
||||
from vertexai.preview.evaluation.metrics import (
|
||||
_base as metrics_base,
|
||||
)
|
||||
from vertexai.preview.evaluation.metrics import (
|
||||
_default_templates,
|
||||
)
|
||||
from vertexai.preview.evaluation.metrics import (
|
||||
custom_output_config,
|
||||
)
|
||||
from vertexai.preview.evaluation.metrics import pairwise_metric
|
||||
from vertexai.preview.evaluation.metrics import pointwise_metric
|
||||
from vertexai.preview.evaluation.metrics import (
|
||||
rubric_based_metric,
|
||||
)
|
||||
|
||||
|
||||
AutoraterConfig = gapic_eval_service_types.AutoraterConfig
|
||||
|
||||
_POINTWISE_OUTPUT_CONFIG = custom_output_config.CustomOutputConfig(
|
||||
return_raw_output=True,
|
||||
parsing_fn=utils.parse_pointwise_rubric_result,
|
||||
)
|
||||
|
||||
_PAIRWISE_OUTPUT_CONFIG = custom_output_config.CustomOutputConfig(
|
||||
return_raw_output=True,
|
||||
parsing_fn=utils.parse_pairwise_rubric_result,
|
||||
)
|
||||
_PAIRWISE_AUTORATER_CONFIG = AutoraterConfig(
|
||||
sampling_count=1,
|
||||
)
|
||||
|
||||
|
||||
class PredefinedRubricMetrics:
|
||||
"""Predefined rubric-based metrics."""
|
||||
|
||||
class Pointwise:
|
||||
"""Pointwise rubric-based metrics."""
|
||||
|
||||
INSTRUCTION_FOLLOWING = rubric_based_metric.RubricBasedMetric(
|
||||
generation_config=metrics_base.RubricGenerationConfig(
|
||||
prompt_template=_default_templates.INSTRUCTION_FOLLOWING_RUBRIC_GENERATION_PROMPT_TEMPLATE,
|
||||
),
|
||||
critique_metric=pointwise_metric.PointwiseMetric(
|
||||
metric="rb_instruction_following",
|
||||
metric_prompt_template=_default_templates.INSTRUCTION_FOLLOWING_RUBRIC_CRITIQUE_TEMPLATE,
|
||||
custom_output_config=_POINTWISE_OUTPUT_CONFIG,
|
||||
),
|
||||
)
|
||||
MULTIMODAL_UNDERSTANDING = rubric_based_metric.RubricBasedMetric(
|
||||
generation_config=metrics_base.RubricGenerationConfig(
|
||||
prompt_template=_default_templates.MULTIMODAL_UNDERSTANDING_RUBRIC_GENERATION_PROMPT_TEMPLATE
|
||||
),
|
||||
critique_metric=pointwise_metric.PointwiseMetric(
|
||||
metric="rb_multimodal_understanding",
|
||||
metric_prompt_template=_default_templates.MULTIMODAL_UNDERSTANDING_RUBRIC_CRITIQUE_TEMPLATE,
|
||||
custom_output_config=_POINTWISE_OUTPUT_CONFIG,
|
||||
),
|
||||
)
|
||||
TEXT_QUALITY = rubric_based_metric.RubricBasedMetric(
|
||||
generation_config=metrics_base.RubricGenerationConfig(
|
||||
prompt_template=_default_templates.TEXT_QUALITY_RUBRIC_GENERATION_PROMPT_TEMPLATE
|
||||
),
|
||||
critique_metric=pointwise_metric.PointwiseMetric(
|
||||
metric="rb_text_quality",
|
||||
metric_prompt_template=_default_templates.TEXT_QUALITY_RUBRIC_CRITIQUE_TEMPLATE,
|
||||
custom_output_config=_POINTWISE_OUTPUT_CONFIG,
|
||||
),
|
||||
)
|
||||
|
||||
class Pairwise:
|
||||
"""Pairwise rubric-based metrics."""
|
||||
|
||||
INSTRUCTION_FOLLOWING = rubric_based_metric.RubricBasedMetric(
|
||||
generation_config=metrics_base.RubricGenerationConfig(
|
||||
prompt_template=_default_templates.INSTRUCTION_FOLLOWING_RUBRIC_GENERATION_PROMPT_TEMPLATE,
|
||||
),
|
||||
critique_metric=pairwise_metric.PairwiseMetric(
|
||||
metric="pairwise_rb_instruction_following",
|
||||
metric_prompt_template=_default_templates.PAIRWISE_INSTRUCTION_FOLLOWING_RUBRIC_CRITIQUE_TEMPLATE,
|
||||
custom_output_config=_PAIRWISE_OUTPUT_CONFIG,
|
||||
autorater_config=_PAIRWISE_AUTORATER_CONFIG,
|
||||
),
|
||||
)
|
||||
MULTIMODAL_UNDERSTANDING = rubric_based_metric.RubricBasedMetric(
|
||||
generation_config=metrics_base.RubricGenerationConfig(
|
||||
prompt_template=_default_templates.MULTIMODAL_UNDERSTANDING_RUBRIC_GENERATION_PROMPT_TEMPLATE
|
||||
),
|
||||
critique_metric=pairwise_metric.PairwiseMetric(
|
||||
metric="pairwise_rb_multimodal_understanding",
|
||||
metric_prompt_template=_default_templates.PAIRWISE_MULTIMODAL_UNDERSTANDING_RUBRIC_CRITIQUE_TEMPLATE,
|
||||
custom_output_config=_PAIRWISE_OUTPUT_CONFIG,
|
||||
autorater_config=_PAIRWISE_AUTORATER_CONFIG,
|
||||
),
|
||||
)
|
||||
TEXT_QUALITY = rubric_based_metric.RubricBasedMetric(
|
||||
generation_config=metrics_base.RubricGenerationConfig(
|
||||
prompt_template=_default_templates.TEXT_QUALITY_RUBRIC_GENERATION_PROMPT_TEMPLATE
|
||||
),
|
||||
critique_metric=pairwise_metric.PairwiseMetric(
|
||||
metric="pairwise_rb_text_quality",
|
||||
metric_prompt_template=_default_templates.PAIRWISE_TEXT_QUALITY_RUBRIC_CRITIQUE_TEMPLATE,
|
||||
custom_output_config=_PAIRWISE_OUTPUT_CONFIG,
|
||||
autorater_config=_PAIRWISE_AUTORATER_CONFIG,
|
||||
),
|
||||
)
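
# Usage sketch (illustrative only; assumes an initialized Vertex AI
# environment, since rubric generation calls the default judge model, and
# assumes pandas is available; the prompt/response rows are toy data):
# predefined rubric metrics are RubricBasedMetric instances, so rubrics are
# generated first and the bundled critique metric is then run over the
# rubric-augmented dataset.
def _example_rubric_text_quality():
    import pandas as pd  # assumed available in the caller's environment

    metric = PredefinedRubricMetrics.Pointwise.TEXT_QUALITY
    eval_dataset = pd.DataFrame(
        {
            "prompt": ["Write a two-sentence product description."],
            "response": ["A compact mug. It keeps drinks warm."],
        }
    )
    # Step 1: add the rubrics column (calls the default judge model).
    dataset_with_rubrics = metric.generate_rubrics(eval_dataset)
    # Step 2: the bundled critique metric ("rb_text_quality") can then score
    # the responses against the generated rubrics.
    return dataset_with_rubrics
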
|
||||
@@ -0,0 +1,104 @@
|
||||
# Copyright 2025 Google LLC
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import collections
|
||||
from typing import Union, TYPE_CHECKING
|
||||
|
||||
from google.cloud.aiplatform import base
|
||||
from vertexai import generative_models
|
||||
from vertexai.preview.evaluation import _pre_eval_utils
|
||||
from vertexai.preview.evaluation import constants
|
||||
from vertexai.preview.evaluation import utils
|
||||
from vertexai.preview.evaluation.metrics import (
|
||||
_base as metrics_base,
|
||||
)
|
||||
from vertexai.preview.evaluation.metrics import pairwise_metric
|
||||
from vertexai.preview.evaluation.metrics import pointwise_metric
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import pandas as pd
|
||||
|
||||
_DEFAULT_MODEL_NAME = "gemini-2.0-flash-001"
|
||||
_LOGGER = base.Logger(__name__)
|
||||
|
||||
|
||||
class RubricBasedMetric(metrics_base._Metric):
|
||||
"""Config for Rubric-Based Eval."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
generation_config: metrics_base.RubricGenerationConfig,
|
||||
critique_metric: Union[
|
||||
pointwise_metric.PointwiseMetric, pairwise_metric.PairwiseMetric
|
||||
]
|
||||
):
|
||||
"""Initializes RubricBasedMetric.
|
||||
|
||||
Args:
|
||||
generation_config: Config for rubric generation.
|
||||
critique_metric: Pointwise/pairwise metric for rubric critique.
|
||||
"""
|
||||
super().__init__(metric=critique_metric._metric)
|
||||
|
||||
self.generation_config = generation_config
|
||||
self.critique_metric = critique_metric
|
||||
|
||||
def generate_rubrics(
|
||||
self,
|
||||
eval_dataset: "pd.Dataframe",
|
||||
) -> "pd.DataFrame":
|
||||
"""Generates rubrics for given eval dataset."""
|
||||
if not self.generation_config.model:
|
||||
model = generative_models.GenerativeModel(model_name=_DEFAULT_MODEL_NAME)
|
||||
else:
|
||||
model = self.generation_config.model
|
||||
|
||||
if constants.Dataset.RUBRICS_COLUMN in eval_dataset.columns:
|
||||
_LOGGER.warning(
|
||||
"Rubrics column already exists in the dataset. Skipping rubric"
|
||||
" generation."
|
||||
)
|
||||
return eval_dataset
|
||||
|
||||
responses = _pre_eval_utils._generate_responses_from_gemini_model(
|
||||
model,
|
||||
eval_dataset,
|
||||
self.generation_config.prompt_template,
|
||||
)
|
||||
if self.generation_config.parsing_fn:
|
||||
parsing_fn = self.generation_config.parsing_fn
|
||||
else:
|
||||
parsing_fn = utils.parse_rubrics
|
||||
dataset_with_rubrics = eval_dataset.copy()
|
||||
aggregated = collections.defaultdict(list)
|
||||
for idx, response in enumerate(responses):
|
||||
result = parsing_fn(response)
|
||||
if isinstance(result, dict):
|
||||
questions = result.pop("questions", None)
|
||||
if questions is not None:
|
||||
aggregated[constants.Dataset.RUBRICS_COLUMN].append(
|
||||
(idx, questions)
|
||||
)
|
||||
for key, value in result.items():
|
||||
aggregated[key].append((idx, value))
|
||||
else:
|
||||
aggregated[constants.Dataset.RUBRICS_COLUMN].append((idx, result))
|
||||
for key, values in aggregated.items():
|
||||
dataset_with_rubrics[key] = None
|
||||
dataset_with_rubrics[key] = dataset_with_rubrics[key].astype(object)
|
||||
for idx, value in values:
|
||||
dataset_with_rubrics.at[idx, key] = value
|
||||
return dataset_with_rubrics
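
# A minimal construction sketch (illustrative placeholders; the prompt
# template strings and metric name are stand-ins, not shipped defaults): a
# custom RubricBasedMetric pairs a RubricGenerationConfig with a pointwise
# critique metric, mirroring the predefined metrics in
# predefined_rubric_metrics.py.
def _example_custom_rubric_metric() -> RubricBasedMetric:
    generation_config = metrics_base.RubricGenerationConfig(
        prompt_template=(
            "Generate a numbered list of yes/no rubric questions for grading a"
            " response to this prompt:\n{prompt}"
        ),
    )
    critique_metric = pointwise_metric.PointwiseMetric(
        metric="example_rubric_critique",
        metric_prompt_template=(
            "Answer each rubric question for the response.\nRubrics:"
            " {rubrics}\nPrompt: {prompt}\nResponse: {response}"
        ),
    )
    return RubricBasedMetric(
        generation_config=generation_config,
        critique_metric=critique_metric,
    )
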
|
||||