mirror of
https://github.com/EvolutionAPI/adk-python.git
synced 2025-07-13 07:04:51 -06:00

-- 16994cb2d5d646341f5285ca71d72697d81d18fe by Nilanjan De <nilanjan.de@gmail.com>: chore: fix typos COPYBARA_INTEGRATE_REVIEW=https://github.com/google/adk-python/pull/272 from n1lanjan:fix-typos a1ab655b08ec08c5dd2da71aab9a2386e3610e84 PiperOrigin-RevId: 749690489
148 lines
4.8 KiB
Python
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any

import pandas as pd
from tabulate import tabulate
from vertexai.preview.evaluation import EvalTask
from vertexai.preview.evaluation import MetricPromptTemplateExamples


class ResponseEvaluator:
  """Runs response evaluation for agents."""

  @staticmethod
  def evaluate(
      raw_eval_dataset: list[list[dict[str, Any]]],
      evaluation_criteria: list[str],
      *,
      print_detailed_results: bool = False
  ):
r"""Returns the value of requested evaluation metrics.
|
|
|
|
Args:
|
|
raw_eval_dataset: The dataset that will be evaluated.
|
|
evaluation_criteria: The evaluation criteria to be used. This method
|
|
support two criteria, `response_evaluation_score` and
|
|
`response_match_score`.
|
|
print_detailed_results: Prints detailed results on the console. This is
|
|
usually helpful during debugging.
|
|
|
|
    A note on evaluation_criteria:
      `response_match_score`: This metric compares the agent's final natural
        language response with the expected final response, stored in the
        "reference" field in test/eval files. We use the ROUGE metric to
        compare the two responses.

        Value range: [0, 1]. A score closer to 0 means poor similarity
        between response and reference. A score closer to 1 means strong
        similarity between response and reference.

      `response_evaluation_score`: Uses an LLM to evaluate the coherence of
        the response, including tool use. This is a pointwise metric.

        Value range: [0, 5], where 0 means the agent's response is not
        coherent at all and 5 means it is fully coherent. Higher values are
        better.

    A note on raw_eval_dataset:
      The dataset should be a list of sessions, where each session is
      represented as a list of interactions that need evaluation. Each
      interaction is represented as a dictionary that is expected to have
      values for the following keys:

        1) query
        2) response
        3) actual_tool_use
        4) expected_tool_use
        5) reference

      Here is a sample eval_dataset value with one entry:

      [
        [
          {
            "query": "roll a die for me",
            "response": "I rolled a 16 sided die and got 13.\n",
            "expected_tool_use": [
              {
                "tool_name": "roll_die",
                "tool_input": {
                  "sides": 16
                }
              }
            ],
            "actual_tool_use": [
              {
                "tool_name": "roll_die",
                "tool_input": {
                  "sides": 16
                }
              }
            ],
            "reference": "I rolled a 16 sided die and got 13.\n"
          }
        ]
      ]
    """
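    # Guard clause: evaluation requires at least one session in the dataset.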
    if not raw_eval_dataset:
      raise ValueError("The evaluation dataset is empty.")

    metrics = ResponseEvaluator._get_metrics(
        raw_eval_dataset, evaluation_criteria
    )
    # Flatten the per-session lists into a single list of interactions, then
    # rename columns to the schema the evaluation service expects.
    flattened_queries = [
        item for sublist in raw_eval_dataset for item in sublist
    ]
    eval_dataset = pd.DataFrame(flattened_queries).rename(
        columns={"query": "prompt", "expected_tool_use": "reference_trajectory"}
    )

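    # The call below reaches out to the Vertex AI evaluation service; it is
    # wrapped in a helper so unit tests can stub it out.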
    eval_result = ResponseEvaluator._perform_eval(
        dataset=eval_dataset, metrics=metrics
    )

    if print_detailed_results:
      ResponseEvaluator._print_results(eval_result)
    return eval_result.summary_metrics

  @staticmethod
  def _get_metrics(raw_eval_dataset, criteria):
    metrics = []
    # `response_evaluation_score` maps to the LLM-judged pointwise coherence
    # metric; it requires both a query and an expected tool-use trajectory.
    if (
        "response_evaluation_score" in criteria
        and "query" in raw_eval_dataset[0][0]
        and "expected_tool_use" in raw_eval_dataset[0][0]
    ):
      metrics.append(MetricPromptTemplateExamples.Pointwise.COHERENCE)
    # `response_match_score` maps to ROUGE-1 and requires a reference
    # response to compare against.
    if (
        "response_match_score" in criteria
        and "reference" in raw_eval_dataset[0][0]
    ):
      metrics.append("rouge_1")
    return metrics

  @staticmethod
  def _perform_eval(dataset, metrics):
    """This method hides away the call to the external service.

    Primarily helps with unit testing.
    """
    eval_task = EvalTask(dataset=dataset, metrics=metrics)

    return eval_task.evaluate()

  @staticmethod
  def _print_results(eval_result):
    print("Evaluation Summary Metrics:", eval_result.summary_metrics)
    print(tabulate(eval_result.metrics_table, headers="keys", tablefmt="grid"))
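

# --- Usage sketch (illustrative; not part of the mirrored module) ---
# A minimal example of how the evaluator might be invoked, assuming Vertex AI
# credentials and a project are already configured in the environment. The
# dataset reuses the sample from the `evaluate` docstring; `sample_dataset`
# and `summary` are hypothetical names introduced for this sketch.
if __name__ == "__main__":
  sample_dataset = [
      [
          {
              "query": "roll a die for me",
              "response": "I rolled a 16 sided die and got 13.\n",
              "expected_tool_use": [
                  {"tool_name": "roll_die", "tool_input": {"sides": 16}}
              ],
              "actual_tool_use": [
                  {"tool_name": "roll_die", "tool_input": {"sides": 16}}
              ],
              "reference": "I rolled a 16 sided die and got 13.\n",
          }
      ]
  ]
  # Request only the ROUGE-based metric; add "response_evaluation_score" to
  # the criteria list to also run the LLM-judged coherence metric.
  summary = ResponseEvaluator.evaluate(
      sample_dataset,
      ["response_match_score"],
      print_detailed_results=True,
  )
  print(summary)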