adk-python/src/google/adk/evaluation/response_evaluator.py

# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any

import pandas as pd
from tabulate import tabulate
from vertexai.preview.evaluation import EvalTask
from vertexai.preview.evaluation import MetricPromptTemplateExamples


class ResponseEvaluator:
"""Runs response evaluation for agents."""
@staticmethod
def evaluate(
raw_eval_dataset: list[list[dict[str, Any]]],
evaluation_criteria: list[str],
*,
print_detailed_results: bool = False
):
r"""Returns the value of requested evaluation metrics.
Args:
raw_eval_dataset: The dataset that will be evaluated.
evaluation_criteria: The evaluation criteria to be used. This method
        supports two criteria, `response_evaluation_score` and
`response_match_score`.
print_detailed_results: Prints detailed results on the console. This is
usually helpful during debugging.
A note on evaluation_criteria:
      `response_match_score`: This metric compares the agent's final natural
        language response with the expected final response, stored in the
        "reference" field in test/eval files. We use the ROUGE-1 metric to
        compare the two responses.
Value Range: [0, 1]. A score closer to 0 means poor similarity between
response and reference. A score closer to 1 means strong similarity
between response and reference.
      `response_evaluation_score`: Uses an LLM to evaluate the coherence of
        the response, including tool use. This is a pointwise metric.
Value range: [0, 5], where 0 means that the agent's response is not
        coherent, while 5 means it is. Higher values are better.
A note on raw_eval_dataset:
      The dataset should be a list of sessions, where each session is
      represented as a list of interactions that need evaluation. Each
      interaction is represented as a dictionary that is expected to have
      values for the following keys:
1) query
2) response
        3) actual_tool_use
4) expected_tool_use
5) reference
Here is a sample eval_dataset value with one entry:
[
[
{
"query": "roll a die for me",
"response": "I rolled a 16 sided die and got 13.\n",
"expected_tool_use": [
{
"tool_name": "roll_die",
"tool_input": {
"sides": 16
}
}
],
"acutal_tool_use": [
{
"tool_name": "roll_die",
"tool_input": {
"sides": 16
}
}
],
"reference": "I rolled a 16 sided die and got 13.\n"
}
]
]
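
    Example usage (illustrative; assumes the sample above is bound to
    `eval_dataset` and that Vertex AI evaluation is configured for the
    active Google Cloud project):

      summary = ResponseEvaluator.evaluate(
          raw_eval_dataset=eval_dataset,
          evaluation_criteria=["response_match_score"],
          print_detailed_results=True,
      )
      # `summary` is the summary_metrics dict produced by the underlying
      # Vertex AI EvalTask run.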
"""
if not raw_eval_dataset:
raise ValueError("The evaluation dataset is empty.")
metrics = ResponseEvaluator._get_metrics(
raw_eval_dataset, evaluation_criteria
)
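    # Flatten the nested per-session lists into a single list of evaluation
    # dicts, one row per interaction.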
flattened_queries = [
item for sublist in raw_eval_dataset for item in sublist
]
eval_dataset = pd.DataFrame(flattened_queries).rename(
columns={"query": "prompt", "expected_tool_use": "reference_trajectory"}
)
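    # The rename maps the agent "query" to "prompt" and "expected_tool_use"
    # to "reference_trajectory", the column names used by the Vertex AI
    # evaluation service; other keys such as "response" and "reference" keep
    # their original names.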
eval_result = ResponseEvaluator._perform_eval(
dataset=eval_dataset, metrics=metrics
)
if print_detailed_results:
ResponseEvaluator._print_results(eval_result)
return eval_result.summary_metrics
@staticmethod
def _get_metrics(raw_eval_dataset, criteria):
metrics = []
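    # `response_evaluation_score` is served by the Vertex AI pointwise
    # COHERENCE metric; it is only added when the dataset rows carry both a
    # query and an expected_tool_use entry.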
if (
"response_evaluation_score" in criteria
and "query" in raw_eval_dataset[0][0]
and "expected_tool_use" in raw_eval_dataset[0][0]
):
metrics.append(MetricPromptTemplateExamples.Pointwise.COHERENCE)
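    # `response_match_score` is served by the rouge_1 metric and requires a
    # "reference" answer to compare the agent response against.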
if (
"response_match_score" in criteria
and "reference" in raw_eval_dataset[0][0]
):
metrics.append("rouge_1")
return metrics
@staticmethod
def _perform_eval(dataset, metrics):
"""This method hides away the call to external service.
Primarily helps with unit testing.
"""
eval_task = EvalTask(dataset=dataset, metrics=metrics)
return eval_task.evaluate()
@staticmethod
def _print_results(eval_result):
print("Evaluation Summary Metrics:", eval_result.summary_metrics)
print(tabulate(eval_result.metrics_table, headers="keys", tablefmt="grid"))