src/google/adk/evaluation/trajectory_evaluator.py (new file, 184 lines)
@@ -0,0 +1,184 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any

import pandas as pd
from tabulate import tabulate

from .evaluation_constants import EvalConstants


class TrajectoryEvaluator:
  """Evaluates tool use trajectories for accuracy."""

  @staticmethod
  def evaluate(
      eval_dataset: list[list[dict[str, Any]]],
      *,
      print_detailed_results: bool = False,
  ):
r"""Returns the mean tool use accuracy of the eval dataset.
|
||||
|
||||
Tool use accuracy is calculated by comparing the expected and actuall tool
|
||||
use trajectories. An exact match scores a 1, 0 otherwise. The final number
|
||||
is an
|
||||
average of these individual scores.
|
||||
|
||||
Value range: [0, 1], where 0 is means none of the too use entries aligned,
|
||||
and 1 would mean all of them aligned. Higher value is good.
|
||||
|
||||
Args:
|
||||
eval_dataset: The dataset that will be evaluated.
|
||||
print_detailed_results: Prints detailed results on the console. This is
|
||||
usually helpful during debugging.
|
||||
|
||||
A note on eval_dataset:
|
||||
The dataset should be a list session, where each sesssion is represented
|
||||
as a list of interaction that need evaluation. Each evaluation is
|
||||
represented as a dictionary that is expected to have values for the
|
||||
following keys:
|
||||
1) query
|
||||
2) response
|
||||
3) acutal_tool_use
|
||||
4) expected_tool_use
|
||||
|
||||
Here is a sample eval_dataset value with one entry:
|
||||
|
||||
[
|
||||
[
|
||||
{
|
||||
"query": "Roll a 16 sided dice for me",
|
||||
"response": "I rolled a 16 sided die and got 13.\n",
|
||||
"expected_tool_use": [
|
||||
{
|
||||
"tool_name": "roll_die",
|
||||
"tool_input": {
|
||||
"sides": 16
|
||||
}
|
||||
}
|
||||
],
|
||||
"acutal_tool_use": [
|
||||
{
|
||||
"tool_name": "roll_die",
|
||||
"tool_input": {
|
||||
"sides": 16
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
]
|
||||
"""
|
||||
    if not eval_dataset:
      raise ValueError("The evaluation dataset is empty.")

    results_df = pd.DataFrame(
        columns=[
            "query",
            "response",
            "actual_tool_use",
            "expected_tool_use",
            "tool_use_accuracy",
        ]
    )
    failures = []

    for conversation in eval_dataset:
      for index, row in enumerate(conversation):
        new_row, failure = TrajectoryEvaluator._evaluate_row(row)
        results_df = pd.concat(
            [results_df, pd.DataFrame([new_row])], ignore_index=True
        )
        if failure:
          failure["turn"] = index + 1
          failures.append(failure)

    TrajectoryEvaluator._report_failures(failures)

    if print_detailed_results:
      TrajectoryEvaluator._print_results(results_df)

    return results_df["tool_use_accuracy"].mean()

  @staticmethod
  def _evaluate_row(row):
    # We don't evaluate the mock tool outputs.
    expected = TrajectoryEvaluator._remove_tool_outputs(
        row["expected_tool_use"]
    )
    actual = row["actual_tool_use"]
    tool_use_accuracy = (
        1.0 if TrajectoryEvaluator.are_tools_equal(actual, expected) else 0.0
    )

    new_row = {
        "query": row["query"],
        "response": row["response"],
        "actual_tool_use": actual,
        "expected_tool_use": expected,
        "tool_use_accuracy": tool_use_accuracy,
    }
    failure = (
        None
        if tool_use_accuracy == 1.0
        else {"query": row["query"], "actual": actual, "expected": expected}
    )
    return new_row, failure

  @staticmethod
  def are_tools_equal(list_a_original, list_b_original):
    # Remove other entries that we don't want to evaluate.
    list_a = [
        {"tool_name": tool["tool_name"], "tool_input": tool["tool_input"]}
        for tool in list_a_original
    ]

    list_b = [
        {"tool_name": tool["tool_name"], "tool_input": tool["tool_input"]}
        for tool in list_b_original
    ]

    return list_a == list_b

  @staticmethod
  def _remove_tool_outputs(tool_use_list):
    """Removes 'mock_tool_output' from each dictionary in the list."""
    result = []
    for tool_use in tool_use_list:
      new_tool_use = (
          tool_use.copy()
      )  # Create a copy to avoid modifying the original.
      new_tool_use.pop(
          EvalConstants.MOCK_TOOL_OUTPUT, None
      )  # Remove 'mock_tool_output' if it exists.
      result.append(new_tool_use)
    return result

  @staticmethod
  def _report_failures(failures):
    if failures:
      print("Failures:")
      for failure in failures:
        print(f"""{{
  "turn": {failure["turn"]},
  "query": '{failure["query"]}',
  "actual": {failure["actual"]},
  "expected_tool_use": {failure["expected"]},
}}
""")

  @staticmethod
  def _print_results(results_df):
    print(tabulate(results_df, headers="keys", tablefmt="grid"))
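As a minimal usage sketch (not part of the commit above, and assuming the package installs so that the module is importable as google.adk.evaluation.trajectory_evaluator), the evaluator can be driven with the single-interaction dataset from the evaluate() docstring:

from google.adk.evaluation.trajectory_evaluator import TrajectoryEvaluator

# One session containing one interaction, in the format described by the
# evaluate() docstring.
eval_dataset = [
    [
        {
            "query": "Roll a 16 sided dice for me",
            "response": "I rolled a 16 sided die and got 13.\n",
            "expected_tool_use": [
                {"tool_name": "roll_die", "tool_input": {"sides": 16}}
            ],
            "actual_tool_use": [
                {"tool_name": "roll_die", "tool_input": {"sides": 16}}
            ],
        }
    ]
]

# The actual and expected trajectories match exactly, so the returned mean
# tool use accuracy should be 1.0.
mean_accuracy = TrajectoryEvaluator.evaluate(
    eval_dataset, print_detailed_results=True
)
print(mean_accuracy)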