diff --git a/src/google/adk/cli/cli_eval.py b/src/google/adk/cli/cli_eval.py
index fc9d5af..7a21cf8 100644
--- a/src/google/adk/cli/cli_eval.py
+++ b/src/google/adk/cli/cli_eval.py
@@ -20,7 +20,7 @@
 import os
 import sys
 import traceback
 from typing import Any
-from typing import Generator
+from typing import AsyncGenerator
 from typing import Optional
 import uuid
@@ -146,7 +146,7 @@ def parse_and_get_evals_to_run(
   return eval_set_to_evals
 
 
-def run_evals(
+async def run_evals(
     eval_set_to_evals: dict[str, list[str]],
     root_agent: Agent,
     reset_func: Optional[Any],
@@ -154,7 +154,7 @@
     session_service=None,
     artifact_service=None,
     print_detailed_results=False,
-) -> Generator[EvalResult, None, None]:
+) -> AsyncGenerator[EvalResult, None]:
   try:
     from ..evaluation.agent_evaluator import EvaluationGenerator
     from ..evaluation.response_evaluator import ResponseEvaluator
@@ -181,14 +181,16 @@
       print(f"Running Eval: {eval_set_file}:{eval_name}")
       session_id = f"{EVAL_SESSION_ID_PREFIX}{str(uuid.uuid4())}"
 
-      scrape_result = EvaluationGenerator._process_query_with_root_agent(
-          data=eval_data,
-          root_agent=root_agent,
-          reset_func=reset_func,
-          initial_session=initial_session,
-          session_id=session_id,
-          session_service=session_service,
-          artifact_service=artifact_service,
+      scrape_result = (
+          await EvaluationGenerator._process_query_with_root_agent(
+              data=eval_data,
+              root_agent=root_agent,
+              reset_func=reset_func,
+              initial_session=initial_session,
+              session_id=session_id,
+              session_service=session_service,
+              artifact_service=artifact_service,
+          )
       )
 
       eval_metric_results = []
diff --git a/src/google/adk/cli/cli_tools_click.py b/src/google/adk/cli/cli_tools_click.py
index 83a2c3f..2898673 100644
--- a/src/google/adk/cli/cli_tools_click.py
+++ b/src/google/adk/cli/cli_tools_click.py
@@ -258,12 +258,14 @@ def cli_eval(
 
   try:
     eval_results = list(
-        run_evals(
-            eval_set_to_evals,
-            root_agent,
-            reset_func,
-            eval_metrics,
-            print_detailed_results=print_detailed_results,
+        asyncio.run(
+            run_evals(
+                eval_set_to_evals,
+                root_agent,
+                reset_func,
+                eval_metrics,
+                print_detailed_results=print_detailed_results,
+            )
         )
     )
   except ModuleNotFoundError:
diff --git a/src/google/adk/cli/fast_api.py b/src/google/adk/cli/fast_api.py
index ed663a5..aa3de52 100644
--- a/src/google/adk/cli/fast_api.py
+++ b/src/google/adk/cli/fast_api.py
@@ -467,7 +467,7 @@ def get_fast_api_app(
     )
     root_agent = await _get_root_agent_async(app_name)
     eval_results = list(
-        run_evals(
+        await run_evals(
            eval_set_to_evals,
            root_agent,
            getattr(root_agent, "reset_data", None),
diff --git a/src/google/adk/evaluation/agent_evaluator.py b/src/google/adk/evaluation/agent_evaluator.py
index 1de087f..d97cd1f 100644
--- a/src/google/adk/evaluation/agent_evaluator.py
+++ b/src/google/adk/evaluation/agent_evaluator.py
@@ -76,7 +76,7 @@
     return DEFAULT_CRITERIA
 
   @staticmethod
-  def evaluate(
+  async def evaluate(
       agent_module,
       eval_dataset_file_path_or_dir,
       num_runs=NUM_RUNS,
@@ -120,7 +120,7 @@
 
       AgentEvaluator._validate_input([dataset], criteria)
 
-      evaluation_response = AgentEvaluator._generate_responses(
+      evaluation_response = await AgentEvaluator._generate_responses(
           agent_module,
           [dataset],
           num_runs,
@@ -246,7 +246,7 @@
     return inferred_criteria
 
   @staticmethod
-  def _generate_responses(
+  async def _generate_responses(
       agent_module, eval_dataset, num_runs, agent_name=None, initial_session={}
   ):
     """Generates evaluation responses by running the agent module multiple times."""
diff --git a/src/google/adk/evaluation/evaluation_generator.py b/src/google/adk/evaluation/evaluation_generator.py
index 3a43098..779c8ad 100644
--- a/src/google/adk/evaluation/evaluation_generator.py
+++ b/src/google/adk/evaluation/evaluation_generator.py
@@ -32,7 +32,7 @@ class EvaluationGenerator:
   """Generates evaluation responses for agents."""
 
   @staticmethod
-  def generate_responses(
+  async def generate_responses(
       eval_dataset,
       agent_module_path,
       repeat_num=3,
@@ -107,7 +107,7 @@ class EvaluationGenerator:
     )
 
   @staticmethod
-  def _process_query_with_root_agent(
+  async def _process_query_with_root_agent(
       data,
       root_agent,
       reset_func,
@@ -128,7 +128,7 @@ class EvaluationGenerator:
         all_mock_tools.add(expected[EvalConstants.TOOL_NAME])
 
     eval_data_copy = data.copy()
-    EvaluationGenerator.apply_before_tool_callback(
+    await EvaluationGenerator.apply_before_tool_callback(
         root_agent,
         lambda *args: EvaluationGenerator.before_tool_callback(
            *args, eval_dataset=eval_data_copy
@@ -247,7 +247,7 @@ class EvaluationGenerator:
     return None
 
   @staticmethod
-  def apply_before_tool_callback(
+  async def apply_before_tool_callback(
       agent: BaseAgent,
       callback: BeforeToolCallback,
       all_mock_tools: set[str],
@@ -265,6 +265,6 @@ class EvaluationGenerator:
 
     # Apply recursively to subagents if they exist
     for sub_agent in agent.sub_agents:
-      EvaluationGenerator.apply_before_tool_callback(
+      await EvaluationGenerator.apply_before_tool_callback(
           sub_agent, callback, all_mock_tools
       )
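Note on the two `run_evals` call sites above: the new annotation declares `run_evals` an async generator, but `asyncio.run()` in `cli_tools_click.py` expects a coroutine, and `await` in `fast_api.py` only works on awaitables. If `run_evals` still yields `EvalResult` objects one at a time (as the `AsyncGenerator` annotation suggests), a synchronous caller would need a small coroutine wrapper to drain it. A minimal sketch of that pattern; the helper name `_drain_run_evals` is illustrative, not part of the ADK API:

```python
import asyncio

from google.adk.cli.cli_eval import run_evals


async def _drain_run_evals(*args, **kwargs) -> list:
  # run_evals(...) returns an async generator object; iterate it inside a
  # coroutine so a synchronous caller can execute it with asyncio.run().
  return [result async for result in run_evals(*args, **kwargs)]


# Synchronous call site, e.g. inside the `adk eval` Click command:
# eval_results = asyncio.run(
#     _drain_run_evals(
#         eval_set_to_evals,
#         root_agent,
#         reset_func,
#         eval_metrics,
#         print_detailed_results=print_detailed_results,
#     )
# )
```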
""" - AgentEvaluator.evaluate( + await AgentEvaluator.evaluate( agent_module=agent_name, eval_dataset_file_path_or_dir=evalfile, initial_session_file=initial_session_file, diff --git a/tests/integration/test_multi_agent.py b/tests/integration/test_multi_agent.py index 48b5897..81beed1 100644 --- a/tests/integration/test_multi_agent.py +++ b/tests/integration/test_multi_agent.py @@ -15,7 +15,8 @@ from google.adk.evaluation import AgentEvaluator -def test_eval_agent(): +@pytest.mark.asyncio +async def test_eval_agent(): AgentEvaluator.evaluate( agent_module="tests.integration.fixture.trip_planner_agent", eval_dataset_file_path_or_dir=( diff --git a/tests/integration/test_multi_turn.py b/tests/integration/test_multi_turn.py index 0281082..ce56ede 100644 --- a/tests/integration/test_multi_turn.py +++ b/tests/integration/test_multi_turn.py @@ -15,7 +15,8 @@ from google.adk.evaluation import AgentEvaluator -def test_simple_multi_turn_conversation(): +@pytest.mark.asyncio +async def test_simple_multi_turn_conversation(): """Test a simple multi-turn conversation.""" AgentEvaluator.evaluate( agent_module="tests.integration.fixture.home_automation_agent", @@ -24,7 +25,8 @@ def test_simple_multi_turn_conversation(): ) -def test_dependent_tool_calls(): +@pytest.mark.asyncio +async def test_dependent_tool_calls(): """Test subsequent tool calls that are dependent on previous tool calls.""" AgentEvaluator.evaluate( agent_module="tests.integration.fixture.home_automation_agent", @@ -33,8 +35,10 @@ def test_dependent_tool_calls(): ) -def test_memorizing_past_events(): +@pytest.mark.asyncio +async def test_memorizing_past_events(): """Test memorizing past events.""" + AgentEvaluator.evaluate( agent_module="tests.integration.fixture.home_automation_agent", eval_dataset_file_path_or_dir="tests/integration/fixture/home_automation_agent/test_files/memorizing_past_events/eval_data.test.json", diff --git a/tests/integration/test_single_agent.py b/tests/integration/test_single_agent.py index 5c7c26d..cb18ce8 100644 --- a/tests/integration/test_single_agent.py +++ b/tests/integration/test_single_agent.py @@ -15,7 +15,8 @@ from google.adk.evaluation import AgentEvaluator -def test_eval_agent(): +@pytest.mark.asyncio +async def test_eval_agent(): AgentEvaluator.evaluate( agent_module="tests.integration.fixture.home_automation_agent", eval_dataset_file_path_or_dir="tests/integration/fixture/home_automation_agent/simple_test.test.json", diff --git a/tests/integration/test_sub_agent.py b/tests/integration/test_sub_agent.py index 27646bf..6eb7192 100644 --- a/tests/integration/test_sub_agent.py +++ b/tests/integration/test_sub_agent.py @@ -15,7 +15,8 @@ from google.adk.evaluation import AgentEvaluator -def test_eval_agent(): +@pytest.mark.asyncio +async def test_eval_agent(): """Test hotel sub agent in a multi-agent system.""" AgentEvaluator.evaluate( agent_module="tests.integration.fixture.trip_planner_agent", diff --git a/tests/integration/test_with_test_file.py b/tests/integration/test_with_test_file.py index 2024e1d..68c9ba3 100644 --- a/tests/integration/test_with_test_file.py +++ b/tests/integration/test_with_test_file.py @@ -15,7 +15,8 @@ from google.adk.evaluation import AgentEvaluator -def test_with_single_test_file(): +@pytest.mark.asyncio +async def test_with_single_test_file(): """Test the agent's basic ability via session file.""" AgentEvaluator.evaluate( agent_module="tests.integration.fixture.home_automation_agent", @@ -23,7 +24,8 @@ def test_with_single_test_file(): ) -def 
diff --git a/tests/integration/test_with_test_file.py b/tests/integration/test_with_test_file.py
index 2024e1d..68c9ba3 100644
--- a/tests/integration/test_with_test_file.py
+++ b/tests/integration/test_with_test_file.py
@@ -15,7 +15,10 @@
+import pytest
+
 from google.adk.evaluation import AgentEvaluator
 
 
-def test_with_single_test_file():
+@pytest.mark.asyncio
+async def test_with_single_test_file():
   """Test the agent's basic ability via session file."""
-  AgentEvaluator.evaluate(
+  await AgentEvaluator.evaluate(
       agent_module="tests.integration.fixture.home_automation_agent",
@@ -23,7 +26,8 @@
 )
 
 
-def test_with_folder_of_test_files_long_running():
+@pytest.mark.asyncio
+async def test_with_folder_of_test_files_long_running():
   """Test the agent's basic ability via a folder of session files."""
-  AgentEvaluator.evaluate(
+  await AgentEvaluator.evaluate(
       agent_module="tests.integration.fixture.home_automation_agent",
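Outside pytest, any caller of `AgentEvaluator.evaluate` now needs its own event loop. A sketch of a one-off driver script, reusing the fixture paths from the tests above:

```python
import asyncio

from google.adk.evaluation import AgentEvaluator


async def main() -> None:
  # evaluate() is a coroutine after this change; forgetting the await
  # would trigger "coroutine was never awaited" and evaluate nothing.
  await AgentEvaluator.evaluate(
      agent_module="tests.integration.fixture.home_automation_agent",
      eval_dataset_file_path_or_dir=(
          "tests/integration/fixture/home_automation_agent/simple_test.test.json"
      ),
  )


if __name__ == "__main__":
  asyncio.run(main())
```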