mirror of https://github.com/EvolutionAPI/adk-python.git
Agent Development Kit (ADK)
An easy-to-use and powerful framework to build AI agents.
270  src/google/adk/evaluation/evaluation_generator.py  Normal file
@@ -0,0 +1,270 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import importlib
import uuid

from google.genai import types

from ..agents.base_agent import BaseAgent
from ..agents.llm_agent import Agent
from ..agents.llm_agent import BeforeToolCallback
from ..agents.llm_agent import LlmAgent
from ..artifacts.in_memory_artifact_service import InMemoryArtifactService
from ..runners import Runner
from ..sessions.in_memory_session_service import InMemorySessionService
from ..sessions.session import Session
from .evaluation_constants import EvalConstants


class EvaluationGenerator:
  """Generates evaluation responses for agents."""

  @staticmethod
  def generate_responses(
      eval_dataset,
      agent_module_path,
      repeat_num=3,
      agent_name=None,
      initial_session={},
  ):
    """Returns evaluation responses for the given dataset and agent.

    Args:
      eval_dataset: The dataset that needs to be scraped for responses.
      agent_module_path: Path to the module that contains the root agent.
      repeat_num: Number of times the eval dataset should be repeated. This is
        usually done to remove the uncertainty that a single run may bring.
      agent_name: The name of the agent that should be evaluated. This is
        usually the sub-agent.
      initial_session: Initial session for the eval data.
    """
    results = []

    for _ in range(repeat_num):
      for data in eval_dataset:
        results.append(
            EvaluationGenerator._process_query(
                data, agent_module_path, agent_name, initial_session
            )
        )

    return results
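
  # Illustrative usage (module path and dataset contents are placeholders).
  # generate_responses expects a module whose `agent` attribute exposes
  # `root_agent`, and a dataset that is a list of sessions, each session being
  # a list of turns keyed by "query":
  #
  #   eval_dataset = [[{"query": "What is the capital of France?",
  #                     "expected_tool_use": []}]]
  #   responses = EvaluationGenerator.generate_responses(
  #       eval_dataset, "my_agent_package", repeat_num=1
  #   )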

  @staticmethod
  def generate_responses_from_session(session_path, eval_dataset):
    """Returns evaluation responses by combining session data with eval data.

    Args:
      session_path: Path to a JSON file that contains session data.
      eval_dataset: The eval dataset that should be combined with the session
        data.
    """
    results = []

    with open(session_path, "r") as f:
      session_data = Session.model_validate_json(f.read())
      print("loaded session", session_path)

    for data in eval_dataset:
      # load session data from session_path
      results.append(
          EvaluationGenerator._process_query_with_session(
              session_data,
              data,
          )
      )

    return results
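
  # Illustrative usage ("session.json" is a placeholder path to a Session
  # serialized as JSON):
  #
  #   responses = EvaluationGenerator.generate_responses_from_session(
  #       "session.json", eval_dataset
  #   )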

  @staticmethod
  def _process_query(data, module_name, agent_name=None, initial_session={}):
    """Process a query using the agent and evaluation dataset."""
    module_path = f"{module_name}"
    agent_module = importlib.import_module(module_path)
    root_agent = agent_module.agent.root_agent

    reset_func = getattr(agent_module.agent, "reset_data", None)

    agent_to_evaluate = root_agent
    if agent_name:
      agent_to_evaluate = root_agent.find_agent(agent_name)
      assert agent_to_evaluate, f"Sub-Agent `{agent_name}` not found."

    return EvaluationGenerator._process_query_with_root_agent(
        data, agent_to_evaluate, reset_func, initial_session
    )

  @staticmethod
  def _process_query_with_root_agent(
      data,
      root_agent,
      reset_func,
      initial_session={},
      session_id=None,
      session_service=None,
      artifact_service=None,
  ):
    """Process a query using the agent and evaluation dataset."""

    # we don't know which tools belong to which agent,
    # so we just apply to any agent that has the expected tool outputs
    all_mock_tools = set()
    for eval_entry in data:
      expected_tool_use = eval_entry.get(EvalConstants.EXPECTED_TOOL_USE, [])
      for expected in expected_tool_use:
        if EvalConstants.MOCK_TOOL_OUTPUT in expected:
          all_mock_tools.add(expected[EvalConstants.TOOL_NAME])

    eval_data_copy = data.copy()
    EvaluationGenerator.apply_before_tool_callback(
        root_agent,
        lambda *args: EvaluationGenerator.before_tool_callback(
            *args, eval_dataset=eval_data_copy
        ),
        all_mock_tools,
    )

    if not session_service:
      session_service = InMemorySessionService()

    app_name = initial_session.get("app_name", "EvaluationGenerator")
    user_id = initial_session.get("user_id", "test_user_id")
    session_id = session_id if session_id else str(uuid.uuid4())

    _ = session_service.create_session(
        app_name=app_name,
        user_id=user_id,
        state=initial_session.get("state", {}),
        session_id=session_id,
    )

    if not artifact_service:
      artifact_service = InMemoryArtifactService()
    runner = Runner(
        app_name=app_name,
        agent=root_agent,
        artifact_service=artifact_service,
        session_service=session_service,
    )

    # Reset agent state for each query
    if callable(reset_func):
      reset_func()

    responses = data.copy()

    for index, eval_entry in enumerate(responses):
      response = None
      query = eval_entry["query"]
      content = types.Content(role="user", parts=[types.Part(text=query)])
      turn_actual_tool_uses = []

      for event in runner.run(
          user_id=user_id, session_id=session_id, new_message=content
      ):
        if event.is_final_response() and event.content and event.content.parts:
          response = event.content.parts[0].text
        elif event.get_function_calls():
          for call in event.get_function_calls():
            turn_actual_tool_uses.append({
                EvalConstants.TOOL_NAME: call.name,
                EvalConstants.TOOL_INPUT: call.args,
            })

      responses[index]["actual_tool_use"] = turn_actual_tool_uses
      responses[index]["response"] = response

    return responses
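
  # Each entry returned above is the original eval turn with two fields filled
  # in from the run: "actual_tool_use", a list of
  # {EvalConstants.TOOL_NAME: ..., EvalConstants.TOOL_INPUT: ...} dicts, and
  # "response", the text of the final response (None if no final response was
  # produced).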

  @staticmethod
  def _process_query_with_session(session_data, data):
    """Process the queries using the existing session data without invoking the runner."""
    responses = data.copy()

    # Iterate through the provided queries and align them with the session events
    for index, eval_entry in enumerate(responses):
      query = eval_entry["query"]
      actual_tool_uses = []
      response = None

      # Search for the corresponding session events
      for event in session_data.events:
        # Match the query to a user event
        if (
            event.author == "user"
            and event.content
            and event.content.parts
            and event.content.parts[0].text == query
        ):
          # Look for subsequent tool usage or model responses
          for subsequent_event in session_data.events:
            if subsequent_event.invocation_id == event.invocation_id:
              # Extract tool usage
              if subsequent_event.content.parts[0].function_call:
                call = subsequent_event.content.parts[0].function_call
                actual_tool_uses.append(
                    {"tool_name": call.name, "tool_input": call.args}
                )
              # Extract final response
              elif subsequent_event.author != "user":
                response = subsequent_event.content.parts[0].text

      # Update the results for the current query
      responses[index]["actual_tool_use"] = actual_tool_uses
      responses[index]["response"] = response
    return responses

  @staticmethod
  def before_tool_callback(tool, args, tool_context, eval_dataset):
    """Intercepts specific tool calls and returns predefined outputs from eval_dataset."""
    for index, eval_entry in enumerate(eval_dataset):
      expected_tool_use = eval_entry.get("expected_tool_use", [])
      for expected in expected_tool_use:
        if (
            EvalConstants.MOCK_TOOL_OUTPUT in expected
            and tool.name == expected[EvalConstants.TOOL_NAME]
            and args == expected.get(EvalConstants.TOOL_INPUT, {})
        ):
          # pop the matched entry so we don't rematch it again
          eval_dataset.pop(index)
          return {"result": expected[EvalConstants.MOCK_TOOL_OUTPUT]}

    return None
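
  # Illustrative shape of an eval entry that before_tool_callback would
  # intercept (tool name and values are placeholders; the literal keys assume
  # the usual EvalConstants mappings such as "mock_tool_output"):
  #
  #   {
  #       "query": "Roll a 6-sided die",
  #       "expected_tool_use": [{
  #           "tool_name": "roll_die",
  #           "tool_input": {"sides": 6},
  #           "mock_tool_output": 3,
  #       }],
  #   }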

  @staticmethod
  def apply_before_tool_callback(
      agent: BaseAgent,
      callback: BeforeToolCallback,
      all_mock_tools: set[str],
  ):
    """Recursively applies the before_tool_callback to the root agent and all its subagents."""
    # check if the agent has tools that are defined by the evalset;
    # we use the tool's function name to check whether tools match
    if not isinstance(agent, Agent) and not isinstance(agent, LlmAgent):
      return

    for tool in agent.canonical_tools:
      tool_name = tool.name
      if tool_name in all_mock_tools:
        agent.before_tool_callback = callback

    # Apply recursively to subagents if they exist
    for sub_agent in agent.sub_agents:
      EvaluationGenerator.apply_before_tool_callback(
          sub_agent, callback, all_mock_tools
      )