diff --git a/src/google/adk/evaluation/agent_evaluator.py b/src/google/adk/evaluation/agent_evaluator.py index 734ee2e..a29af99 100644 --- a/src/google/adk/evaluation/agent_evaluator.py +++ b/src/google/adk/evaluation/agent_evaluator.py @@ -13,12 +13,18 @@ # limitations under the License. import json +import logging import os from os import path +from typing import Any from typing import Dict from typing import List +from typing import Optional from typing import Union import uuid + +from pydantic import ValidationError + from .eval_set import EvalSet from .evaluation_generator import EvaluationGenerator from .evaluator import EvalStatus @@ -28,6 +34,9 @@ from .local_eval_sets_manager import convert_eval_set_to_pydanctic_schema from .response_evaluator import ResponseEvaluator from .trajectory_evaluator import TrajectoryEvaluator +logger = logging.getLogger(__name__) + + # Constants for default runs and evaluation criteria NUM_RUNS = 2 TOOL_TRAJECTORY_SCORE_KEY = "tool_trajectory_avg_score" @@ -131,18 +140,17 @@ class AgentEvaluator: ) assert evaluation_result.overall_eval_status == EvalStatus.PASSED, ( - f"`{eval_case_responses.eval_case.eval_id}`: " f"{metric_name} for {agent_module} Failed. Expected {threshold}," f" but got {evaluation_result.overall_score}." ) @staticmethod async def evaluate( - agent_module, - eval_dataset_file_path_or_dir, - num_runs=NUM_RUNS, - agent_name=None, - initial_session_file=None, + agent_module: str, + eval_dataset_file_path_or_dir: str, + num_runs: int = NUM_RUNS, + agent_name: Optional[str] = None, + initial_session_file: Optional[str] = None, ): """Evaluates an Agent given eval data. 
@@ -170,25 +178,14 @@ class AgentEvaluator: else: test_files = [eval_dataset_file_path_or_dir] - initial_session = {} - if initial_session_file: - with open(initial_session_file, "r") as f: - initial_session = json.loads(f.read()) + initial_session = AgentEvaluator._get_initial_session(initial_session_file) for test_file in test_files: - data = AgentEvaluator._load_dataset(test_file)[0] criteria = AgentEvaluator.find_config_for_test_file(test_file) - AgentEvaluator._validate_input([data], criteria) - - eval_data = { - "name": test_file, - "data": data, - "initial_session": initial_session, - } - - eval_set = convert_eval_set_to_pydanctic_schema( - eval_set_id=str(uuid.uuid4()), eval_set_in_json_format=[eval_data] + eval_set = AgentEvaluator._load_eval_set_from_file( + test_file, criteria, initial_session ) + await AgentEvaluator.evaluate_eval_set( agent_module=agent_module, eval_set=eval_set, @@ -197,6 +194,88 @@ class AgentEvaluator: agent_name=agent_name, ) + @staticmethod + def migrate_eval_data_to_new_schema( + old_eval_data_file: str, + new_eval_data_file: str, + initial_session_file: Optional[str] = None, + ): + """A utility for migrating eval data to new schema backed by EvalSet.""" + if not old_eval_data_file or not new_eval_data_file: + raise ValueError( + "One of old_eval_data_file or new_eval_data_file is empty." 
+ ) + + criteria = AgentEvaluator.find_config_for_test_file(old_eval_data_file) + initial_session = AgentEvaluator._get_initial_session(initial_session_file) + + eval_set = AgentEvaluator._get_eval_set_from_old_format( + old_eval_data_file, criteria, initial_session + ) + + with open(new_eval_data_file, "w", encoding="utf-8") as f: + f.write(eval_set.model_dump_json(indent=2)) + + @staticmethod + def _load_eval_set_from_file( + eval_set_file: str, + criteria: dict[str, float], + initial_session: dict[str, Any], + ) -> EvalSet: + """Loads an EvalSet from the given file.""" + if os.path.isfile(eval_set_file): + with open(eval_set_file, "r", encoding="utf-8") as f: + content = f.read() + + try: + eval_set = EvalSet.model_validate_json(content) + assert len(initial_session) == 0, ( + "Initial session should be specified as a part of EvalSet file." + " Explicit initial session is only needed, when specifying data in" + " the older schema." + ) + return eval_set + except ValidationError: + # We assume that the eval data was specified in the old format + logger.warning( + f"Contents of {eval_set_file} appear to be in older format. To" + " avoid this warning, please update your test files to contain data" + " in EvalSet schema. You can use `migrate_eval_data_to_new_schema`" + " for migrating your old test files." + ) + + # If we are here, the data must be specified in the older format. 
+ return AgentEvaluator._get_eval_set_from_old_format( + eval_set_file, criteria, initial_session + ) + + @staticmethod + def _get_eval_set_from_old_format( + eval_set_file: str, + criteria: dict[str, float], + initial_session: dict[str, Any], + ) -> EvalSet: + """Builds an EvalSet from a file in the older, pre-EvalSet format.""" + data = AgentEvaluator._load_dataset(eval_set_file)[0] + AgentEvaluator._validate_input([data], criteria) + eval_data = { + "name": eval_set_file, + "data": data, + "initial_session": initial_session, + } + return convert_eval_set_to_pydanctic_schema( + eval_set_id=str(uuid.uuid4()), eval_set_in_json_format=[eval_data] + ) + + @staticmethod + def _get_initial_session(initial_session_file: Optional[str] = None): + """Loads the initial session JSON; returns {} when no file is given.""" + initial_session = {} + if initial_session_file: + with open(initial_session_file, "r", encoding="utf-8") as f: + initial_session = json.loads(f.read()) + return initial_session + @staticmethod def _load_dataset( input_data: Union[str, List[str], List[Dict], List[List[Dict]]], diff --git a/src/google/adk/evaluation/local_eval_sets_manager.py b/src/google/adk/evaluation/local_eval_sets_manager.py index 55f95a5..456893f 100644 --- a/src/google/adk/evaluation/local_eval_sets_manager.py +++ b/src/google/adk/evaluation/local_eval_sets_manager.py @@ -135,7 +135,10 @@ def convert_eval_set_to_pydanctic_schema( ) session_input = None - if "initial_session" in old_eval_case: + if ( + "initial_session" in old_eval_case + and len(old_eval_case["initial_session"]) > 0 + ): session_input = SessionInput( app_name=old_eval_case["initial_session"].get("app_name", ""), user_id=old_eval_case["initial_session"].get("user_id", ""), diff --git a/tests/integration/fixture/ecommerce_customer_service_agent/order_query.test.json b/tests/integration/fixture/ecommerce_customer_service_agent/order_query.test.json index ac424f3..6c215ad 100644 --- 
a/tests/integration/fixture/ecommerce_customer_service_agent/order_query.test.json +++ b/tests/integration/fixture/ecommerce_customer_service_agent/order_query.test.json @@ -1,69 +1,229 @@ -[ - { - 
"query": "Send an email to user user_a whose email address is alice@example.com", - "expected_tool_use": [ - { - "tool_name": "send_email", - "tool_input": { - "email": "alice@example.com", - "user_id": "user_a" +{ + "eval_set_id": "a1157c01-851f-48a8-b956-83cf7f463510", + "name": "a1157c01-851f-48a8-b956-83cf7f463510", + "description": null, + "eval_cases": [ + { + "eval_id": "tests/integration/fixture/ecommerce_customer_service_agent/order_query.test.json", + "conversation": [ + { + "invocation_id": "38d54523-d789-4873-8cc0-d38826c7feb4", + "user_content": { + "parts": [ + { + "video_metadata": null, + "thought": null, + "code_execution_result": null, + "executable_code": null, + "file_data": null, + "function_call": null, + "function_response": null, + "inline_data": null, + "text": "Send an email to user user_a whose email address is alice@example.com" + } + ], + "role": "user" + }, + "final_response": { + "parts": [ + { + "video_metadata": null, + "thought": null, + "code_execution_result": null, + "executable_code": null, + "file_data": null, + "function_call": null, + "function_response": null, + "inline_data": null, + "text": "Email sent to alice@example.com for user id user_a." + } + ], + "role": "model" + }, + "intermediate_data": { + "tool_uses": [ + { + "id": null, + "args": { + "email": "alice@example.com", + "user_id": "user_a" + }, + "name": "send_email" + } + ], + "intermediate_responses": [] + }, + "creation_timestamp": 1747341706.6240807 + }, + { + "invocation_id": "916393ab-0bce-4cb0-98de-6573d4e8e25c", + "user_content": { + "parts": [ + { + "video_metadata": null, + "thought": null, + "code_execution_result": null, + "executable_code": null, + "file_data": null, + "function_call": null, + "function_response": null, + "inline_data": null, + "text": "Can you tell me the status of my order with ID 1?" 
+ } + ], + "role": "user" + }, + "final_response": { + "parts": [ + { + "video_metadata": null, + "thought": null, + "code_execution_result": null, + "executable_code": null, + "file_data": null, + "function_call": null, + "function_response": null, + "inline_data": null, + "text": "Your order with ID 1 is FINISHED." + } + ], + "role": "model" + }, + "intermediate_data": { + "tool_uses": [ + { + "id": null, + "args": { + "order_id": "1" + }, + "name": "get_order_status" + } + ], + "intermediate_responses": [] + }, + "creation_timestamp": 1747341706.6241167 + }, + { + "invocation_id": "511b23d9-56f9-423b-9c31-7626f3411c32", + "user_content": { + "parts": [ + { + "video_metadata": null, + "thought": null, + "code_execution_result": null, + "executable_code": null, + "file_data": null, + "function_call": null, + "function_response": null, + "inline_data": null, + "text": "Cancel all pending order for the user with user id user_a" + } + ], + "role": "user" + }, + "final_response": { + "parts": [ + { + "video_metadata": null, + "thought": null, + "code_execution_result": null, + "executable_code": null, + "file_data": null, + "function_call": null, + "function_response": null, + "inline_data": null, + "text": "I have checked your orders and order 4 was in pending status, so I have cancelled it. 
Order 1 was already finished and couldn't be cancelled.\n" + } + ], + "role": "model" + }, + "intermediate_data": { + "tool_uses": [ + { + "id": null, + "args": { + "user_id": "user_a" + }, + "name": "get_order_ids_for_user" + }, + { + "id": null, + "args": { + "order_id": "1" + }, + "name": "get_order_status" + }, + { + "id": null, + "args": { + "order_id": "4" + }, + "name": "get_order_status" + }, + { + "id": null, + "args": { + "order_id": "4" + }, + "name": "cancel_order" + } + ], + "intermediate_responses": [] + }, + "creation_timestamp": 1747341706.6241703 + }, + { + "invocation_id": "dcdf4b6d-96dd-4602-8c14-0563c6f6b5d0", + "user_content": { + "parts": [ + { + "video_metadata": null, + "thought": null, + "code_execution_result": null, + "executable_code": null, + "file_data": null, + "function_call": null, + "function_response": null, + "inline_data": null, + "text": "What orders have I placed under the username user_b?" + } + ], + "role": "user" + }, + "final_response": { + "parts": [ + { + "video_metadata": null, + "thought": null, + "code_execution_result": null, + "executable_code": null, + "file_data": null, + "function_call": null, + "function_response": null, + "inline_data": null, + "text": "User user_b has placed one order with order ID 2.\n" + } + ], + "role": "model" + }, + "intermediate_data": { + "tool_uses": [ + { + "id": null, + "args": { + "user_id": "user_b" + }, + "name": "get_order_ids_for_user" + } + ], + "intermediate_responses": [] + }, + "creation_timestamp": 1747341706.624196 } - } - ], - "reference": "Email sent to alice@example.com for user id user_a." - }, - { - "query": "Can you tell me the status of my order with ID 1?", - "expected_tool_use": [ - { - "tool_name": "get_order_status", - "tool_input": { - "order_id": "1" - } - } - ], - "reference": "Your order with ID 1 is FINISHED." 
- }, - { - "query": "Cancel all pending order for the user with user id user_a", - "expected_tool_use": [ - { - "tool_name": "get_order_ids_for_user", - "tool_input": { - "user_id": "user_a" - } - }, - { - "tool_name": "get_order_status", - "tool_input": { - "order_id": "1" - } - }, - { - "tool_name": "get_order_status", - "tool_input": { - "order_id": "4" - } - }, - { - "tool_name": "cancel_order", - "tool_input": { - "order_id": "4" - } - } - ], - "reference": "I have checked your orders and order 4 was in pending status, so I have cancelled it. Order 1 was already finished and couldn't be cancelled.\n" - }, - { - "query": "What orders have I placed under the username user_b?", - "expected_tool_use": [ - { - "tool_name": "get_order_ids_for_user", - "tool_input": { - "user_id": "user_b" - } - } - ], - "reference": "User user_b has placed one order with order ID 2.\n" - } -] + ], + "session_input": null, + "creation_timestamp": 1747341706.6242023 + } + ], + "creation_timestamp": 1747341706.6242158 +} \ No newline at end of file diff --git a/tests/integration/fixture/hello_world_agent/roll_die.test.json b/tests/integration/fixture/hello_world_agent/roll_die.test.json index fdc8127..7c1e453 100644 --- a/tests/integration/fixture/hello_world_agent/roll_die.test.json +++ b/tests/integration/fixture/hello_world_agent/roll_die.test.json @@ -1,24 +1,143 @@ -[ - { - "query": "Hi who are you?", - "expected_tool_use": [], - "reference": "I am a data processing agent. I can roll dice and check if the results are prime numbers. What would you like me to do? \n" - }, - { - "query": "What can you do?", - "expected_tool_use": [], - "reference": "I can roll dice for you of different sizes, and I can check if the results are prime numbers. I can also remember previous rolls if you'd like to check those for primes as well. What would you like me to do? 
\n" - }, - { - "query": "Can you roll a die with 6 sides", - "expected_tool_use": [ - { - "tool_name": "roll_die", - "tool_input": { - "sides": 6 +{ + "eval_set_id": "56540925-a5ff-49fe-a4e1-589fe78066f2", + "name": "56540925-a5ff-49fe-a4e1-589fe78066f2", + "description": null, + "eval_cases": [ + { + "eval_id": "tests/integration/fixture/hello_world_agent/roll_die.test.json", + "conversation": [ + { + "invocation_id": "b01f67f0-9f23-44d6-bbe4-36ea235cb9fb", + "user_content": { + "parts": [ + { + "video_metadata": null, + "thought": null, + "code_execution_result": null, + "executable_code": null, + "file_data": null, + "function_call": null, + "function_response": null, + "inline_data": null, + "text": "Hi who are you?" + } + ], + "role": "user" + }, + "final_response": { + "parts": [ + { + "video_metadata": null, + "thought": null, + "code_execution_result": null, + "executable_code": null, + "file_data": null, + "function_call": null, + "function_response": null, + "inline_data": null, + "text": "I am a data processing agent. I can roll dice and check if the results are prime numbers. What would you like me to do? \n" + } + ], + "role": "model" + }, + "intermediate_data": { + "tool_uses": [], + "intermediate_responses": [] + }, + "creation_timestamp": 1747341775.8937013 + }, + { + "invocation_id": "13be0093-ac29-4828-98c6-5bbd570c010c", + "user_content": { + "parts": [ + { + "video_metadata": null, + "thought": null, + "code_execution_result": null, + "executable_code": null, + "file_data": null, + "function_call": null, + "function_response": null, + "inline_data": null, + "text": "What can you do?" 
+ } + ], + "role": "user" + }, + "final_response": { + "parts": [ + { + "video_metadata": null, + "thought": null, + "code_execution_result": null, + "executable_code": null, + "file_data": null, + "function_call": null, + "function_response": null, + "inline_data": null, + "text": "I can roll dice for you of different sizes, and I can check if the results are prime numbers. I can also remember previous rolls if you'd like to check those for primes as well. What would you like me to do? \n" + } + ], + "role": "model" + }, + "intermediate_data": { + "tool_uses": [], + "intermediate_responses": [] + }, + "creation_timestamp": 1747341775.8937378 + }, + { + "invocation_id": "7deda353-c936-4c21-b242-9fa75e45b6a7", + "user_content": { + "parts": [ + { + "video_metadata": null, + "thought": null, + "code_execution_result": null, + "executable_code": null, + "file_data": null, + "function_call": null, + "function_response": null, + "inline_data": null, + "text": "Can you roll a die with 6 sides" + } + ], + "role": "user" + }, + "final_response": { + "parts": [ + { + "video_metadata": null, + "thought": null, + "code_execution_result": null, + "executable_code": null, + "file_data": null, + "function_call": null, + "function_response": null, + "inline_data": null, + "text": null + } + ], + "role": "model" + }, + "intermediate_data": { + "tool_uses": [ + { + "id": null, + "args": { + "sides": 6 + }, + "name": "roll_die" + } + ], + "intermediate_responses": [] + }, + "creation_timestamp": 1747341775.8937788 } - } - ], - "reference": null - } -] + ], + "session_input": null, + "creation_timestamp": 1747341775.8937826 + } + ], + "creation_timestamp": 1747341775.8937957 +} \ No newline at end of file diff --git a/tests/integration/fixture/home_automation_agent/simple_test.test.json b/tests/integration/fixture/home_automation_agent/simple_test.test.json index 978c36f..8e055dd 100644 --- a/tests/integration/fixture/home_automation_agent/simple_test.test.json +++ 
b/tests/integration/fixture/home_automation_agent/simple_test.test.json @@ -1,5 +1,65 @@ -[{ - "query": "Turn off device_2 in the Bedroom.", - "expected_tool_use": [{"tool_name": "set_device_info", "tool_input": {"location": "Bedroom", "device_id": "device_2", "status": "OFF"}}], - "reference": "I have set the device_2 status to off." -}] +{ + "eval_set_id": "b305bd06-38c5-4796-b9c7-d9c7454338b9", + "name": "b305bd06-38c5-4796-b9c7-d9c7454338b9", + "description": null, + "eval_cases": [ + { + "eval_id": "tests/integration/fixture/home_automation_agent/simple_test.test.json", + "conversation": [ + { + "invocation_id": "b7982664-0ab6-47cc-ab13-326656afdf75", + "user_content": { + "parts": [ + { + "video_metadata": null, + "thought": null, + "code_execution_result": null, + "executable_code": null, + "file_data": null, + "function_call": null, + "function_response": null, + "inline_data": null, + "text": "Turn off device_2 in the Bedroom." + } + ], + "role": "user" + }, + "final_response": { + "parts": [ + { + "video_metadata": null, + "thought": null, + "code_execution_result": null, + "executable_code": null, + "file_data": null, + "function_call": null, + "function_response": null, + "inline_data": null, + "text": "I have set the device_2 status to off." 
+ } + ], + "role": "model" + }, + "intermediate_data": { + "tool_uses": [ + { + "id": null, + "args": { + "location": "Bedroom", + "device_id": "device_2", + "status": "OFF" + }, + "name": "set_device_info" + } + ], + "intermediate_responses": [] + }, + "creation_timestamp": 1747337309.2360144 + } + ], + "session_input": null, + "creation_timestamp": 1747337309.2360282 + } + ], + "creation_timestamp": 1747337309.2360387 +} \ No newline at end of file diff --git a/tests/integration/fixture/home_automation_agent/test_files/dependent_tool_calls.test.json b/tests/integration/fixture/home_automation_agent/test_files/dependent_tool_calls.test.json index 0633eab..243c1dc 100644 --- a/tests/integration/fixture/home_automation_agent/test_files/dependent_tool_calls.test.json +++ b/tests/integration/fixture/home_automation_agent/test_files/dependent_tool_calls.test.json @@ -1,18 +1,113 @@ -[ - { - "query": "Turn off device_2 in the Bedroom.", - "expected_tool_use": [{ - "tool_name": "set_device_info", - "tool_input": {"location": "Bedroom", "status": "OFF", "device_id": "device_2"} - }], - "reference": "I have set the device 2 status to off." - }, - { - "query": "What's the status of device_2 in the Bedroom?", - "expected_tool_use": [{ - "tool_name": "get_device_info", - "tool_input": {"device_id": "device_2"} - }], - "reference": "Status of device_2 is off." 
- } -] +{ + "eval_set_id": "1be50511-ff75-4d68-b2d7-2165cbdc1044", + "name": "1be50511-ff75-4d68-b2d7-2165cbdc1044", + "description": null, + "eval_cases": [ + { + "eval_id": "tests/integration/fixture/home_automation_agent/test_files/dependent_tool_calls.test.json", + "conversation": [ + { + "invocation_id": "cbece1c0-3811-45c0-96fc-9a4279075483", + "user_content": { + "parts": [ + { + "video_metadata": null, + "thought": null, + "code_execution_result": null, + "executable_code": null, + "file_data": null, + "function_call": null, + "function_response": null, + "inline_data": null, + "text": "Turn off device_2 in the Bedroom." + } + ], + "role": "user" + }, + "final_response": { + "parts": [ + { + "video_metadata": null, + "thought": null, + "code_execution_result": null, + "executable_code": null, + "file_data": null, + "function_call": null, + "function_response": null, + "inline_data": null, + "text": "I have set the device 2 status to off." + } + ], + "role": "model" + }, + "intermediate_data": { + "tool_uses": [ + { + "id": null, + "args": { + "location": "Bedroom", + "status": "OFF", + "device_id": "device_2" + }, + "name": "set_device_info" + } + ], + "intermediate_responses": [] + }, + "creation_timestamp": 1747340826.1082227 + }, + { + "invocation_id": "cc85cdae-4258-4b94-8fe7-a985b8356190", + "user_content": { + "parts": [ + { + "video_metadata": null, + "thought": null, + "code_execution_result": null, + "executable_code": null, + "file_data": null, + "function_call": null, + "function_response": null, + "inline_data": null, + "text": "What's the status of device_2 in the Bedroom?" + } + ], + "role": "user" + }, + "final_response": { + "parts": [ + { + "video_metadata": null, + "thought": null, + "code_execution_result": null, + "executable_code": null, + "file_data": null, + "function_call": null, + "function_response": null, + "inline_data": null, + "text": "Status of device_2 is off." 
+ } + ], + "role": "model" + }, + "intermediate_data": { + "tool_uses": [ + { + "id": null, + "args": { + "device_id": "device_2" + }, + "name": "get_device_info" + } + ], + "intermediate_responses": [] + }, + "creation_timestamp": 1747340826.1082554 + } + ], + "session_input": null, + "creation_timestamp": 1747340826.108262 + } + ], + "creation_timestamp": 1747340826.108275 +} \ No newline at end of file diff --git a/tests/integration/fixture/home_automation_agent/test_files/memorizing_past_events/eval_data.test.json b/tests/integration/fixture/home_automation_agent/test_files/memorizing_past_events/eval_data.test.json index 0e5778b..612f3cd 100644 --- a/tests/integration/fixture/home_automation_agent/test_files/memorizing_past_events/eval_data.test.json +++ b/tests/integration/fixture/home_automation_agent/test_files/memorizing_past_events/eval_data.test.json @@ -1,17 +1,105 @@ -[ - { - "query": "Turn off device_2 in the Bedroom.", - "expected_tool_use": [ - { - "tool_name": "set_device_info", - "tool_input": {"location": "Bedroom", "device_id": "device_2", "status": "OFF"} - } +{ + "eval_set_id": "94553685-5f19-492b-bc44-f3bc775955e9", + "name": "94553685-5f19-492b-bc44-f3bc775955e9", + "description": null, + "eval_cases": [ + { + "eval_id": "tests/integration/fixture/home_automation_agent/test_files/memorizing_past_events/eval_data.test.json", + "conversation": [ + { + "invocation_id": "a958b622-21d3-4a6c-9c15-1274bbb8a6b6", + "user_content": { + "parts": [ + { + "video_metadata": null, + "thought": null, + "code_execution_result": null, + "executable_code": null, + "file_data": null, + "function_call": null, + "function_response": null, + "inline_data": null, + "text": "Turn off device_2 in the Bedroom." 
+ } + ], + "role": "user" + }, + "final_response": { + "parts": [ + { + "video_metadata": null, + "thought": null, + "code_execution_result": null, + "executable_code": null, + "file_data": null, + "function_call": null, + "function_response": null, + "inline_data": null, + "text": "OK. I've turned off device_2 in the Bedroom. Anything else?\n" + } + ], + "role": "model" + }, + "intermediate_data": { + "tool_uses": [ + { + "id": null, + "args": { + "location": "Bedroom", + "device_id": "device_2", + "status": "OFF" + }, + "name": "set_device_info" + } + ], + "intermediate_responses": [] + }, + "creation_timestamp": 1747340865.7043095 + }, + { + "invocation_id": "1c07123d-4bed-4eb0-9e55-c7f80c70dadf", + "user_content": { + "parts": [ + { + "video_metadata": null, + "thought": null, + "code_execution_result": null, + "executable_code": null, + "file_data": null, + "function_call": null, + "function_response": null, + "inline_data": null, + "text": "What's the command I just issued?" + } + ], + "role": "user" + }, + "final_response": { + "parts": [ + { + "video_metadata": null, + "thought": null, + "code_execution_result": null, + "executable_code": null, + "file_data": null, + "function_call": null, + "function_response": null, + "inline_data": null, + "text": "You asked me to turn off device_2 in the Bedroom.\n" + } + ], + "role": "model" + }, + "intermediate_data": { + "tool_uses": [], + "intermediate_responses": [] + }, + "creation_timestamp": 1747340865.7043421 + } ], - "reference": "OK. I've turned off device_2 in the Bedroom. 
Anything else?\n" - }, - { - "query": "What's the command I just issued?", - "expected_tool_use": [], - "reference": "You asked me to turn off device_2 in the Bedroom.\n" - } -] + "session_input": null, + "creation_timestamp": 1747340865.7043483 + } + ], + "creation_timestamp": 1747340865.704361 +} \ No newline at end of file diff --git a/tests/integration/fixture/home_automation_agent/test_files/simple_multi_turn_conversation.test.json b/tests/integration/fixture/home_automation_agent/test_files/simple_multi_turn_conversation.test.json index 334dd2d..dfe2b15 100644 --- a/tests/integration/fixture/home_automation_agent/test_files/simple_multi_turn_conversation.test.json +++ b/tests/integration/fixture/home_automation_agent/test_files/simple_multi_turn_conversation.test.json @@ -1,18 +1,115 @@ -[ +{ + "eval_set_id": "4412cca6-dfcd-43ab-bbc5-9155380c7137", + "name": "4412cca6-dfcd-43ab-bbc5-9155380c7137", + "description": null, + "eval_cases": [ { - "query": "Turn off device_2 in the Bedroom.", - "expected_tool_use": [{ - "tool_name": "set_device_info", - "tool_input": {"location": "Bedroom", "device_id": "device_2", "status": "OFF"} - }], - "reference": "I have set the device 2 status to off." - }, - { - "query": "Turn on device_2 in the Bedroom.", - "expected_tool_use": [{ - "tool_name": "set_device_info", - "tool_input": {"location": "Bedroom", "status": "ON", "device_id": "device_2"} - }], - "reference": "I have set the device 2 status to on." + "eval_id": "tests/integration/fixture/home_automation_agent/test_files/simple_multi_turn_conversation.test.json", + "conversation": [ + { + "invocation_id": "9f51a1ac-56a4-4b4a-9878-36ff1ae312ce", + "user_content": { + "parts": [ + { + "video_metadata": null, + "thought": null, + "code_execution_result": null, + "executable_code": null, + "file_data": null, + "function_call": null, + "function_response": null, + "inline_data": null, + "text": "Turn off device_2 in the Bedroom." 
+ } + ], + "role": "user" + }, + "final_response": { + "parts": [ + { + "video_metadata": null, + "thought": null, + "code_execution_result": null, + "executable_code": null, + "file_data": null, + "function_call": null, + "function_response": null, + "inline_data": null, + "text": "I have set the device 2 status to off." + } + ], + "role": "model" + }, + "intermediate_data": { + "tool_uses": [ + { + "id": null, + "args": { + "location": "Bedroom", + "device_id": "device_2", + "status": "OFF" + }, + "name": "set_device_info" + } + ], + "intermediate_responses": [] + }, + "creation_timestamp": 1747340791.7353904 + }, + { + "invocation_id": "c82d54d0-5fa8-4f79-a6dc-692090f0d42b", + "user_content": { + "parts": [ + { + "video_metadata": null, + "thought": null, + "code_execution_result": null, + "executable_code": null, + "file_data": null, + "function_call": null, + "function_response": null, + "inline_data": null, + "text": "Turn on device_2 in the Bedroom." + } + ], + "role": "user" + }, + "final_response": { + "parts": [ + { + "video_metadata": null, + "thought": null, + "code_execution_result": null, + "executable_code": null, + "file_data": null, + "function_call": null, + "function_response": null, + "inline_data": null, + "text": "I have set the device 2 status to on." 
+ } + ], + "role": "model" + }, + "intermediate_data": { + "tool_uses": [ + { + "id": null, + "args": { + "location": "Bedroom", + "status": "ON", + "device_id": "device_2" + }, + "name": "set_device_info" + } + ], + "intermediate_responses": [] + }, + "creation_timestamp": 1747340791.7354295 + } + ], + "session_input": null, + "creation_timestamp": 1747340791.7354348 } -] + ], + "creation_timestamp": 1747340791.735446 +} \ No newline at end of file diff --git a/tests/integration/fixture/home_automation_agent/test_files/simple_test.test.json b/tests/integration/fixture/home_automation_agent/test_files/simple_test.test.json index 0e5778b..b324a11 100644 --- a/tests/integration/fixture/home_automation_agent/test_files/simple_test.test.json +++ b/tests/integration/fixture/home_automation_agent/test_files/simple_test.test.json @@ -1,17 +1,105 @@ -[ - { - "query": "Turn off device_2 in the Bedroom.", - "expected_tool_use": [ - { - "tool_name": "set_device_info", - "tool_input": {"location": "Bedroom", "device_id": "device_2", "status": "OFF"} - } +{ + "eval_set_id": "9100bfc9-cc28-4ab9-b920-2dc72e138997", + "name": "9100bfc9-cc28-4ab9-b920-2dc72e138997", + "description": null, + "eval_cases": [ + { + "eval_id": "tests/integration/fixture/home_automation_agent/test_files/simple_test.test.json", + "conversation": [ + { + "invocation_id": "9f5e8d91-8e51-41d6-addf-196a828168c5", + "user_content": { + "parts": [ + { + "video_metadata": null, + "thought": null, + "code_execution_result": null, + "executable_code": null, + "file_data": null, + "function_call": null, + "function_response": null, + "inline_data": null, + "text": "Turn off device_2 in the Bedroom." + } + ], + "role": "user" + }, + "final_response": { + "parts": [ + { + "video_metadata": null, + "thought": null, + "code_execution_result": null, + "executable_code": null, + "file_data": null, + "function_call": null, + "function_response": null, + "inline_data": null, + "text": "OK. 
I've turned off device_2 in the Bedroom. Anything else?\n" + } + ], + "role": "model" + }, + "intermediate_data": { + "tool_uses": [ + { + "id": null, + "args": { + "location": "Bedroom", + "device_id": "device_2", + "status": "OFF" + }, + "name": "set_device_info" + } + ], + "intermediate_responses": [] + }, + "creation_timestamp": 1747340849.0429707 + }, + { + "invocation_id": "767b2451-5f7b-4c73-aeaf-a82c71e15788", + "user_content": { + "parts": [ + { + "video_metadata": null, + "thought": null, + "code_execution_result": null, + "executable_code": null, + "file_data": null, + "function_call": null, + "function_response": null, + "inline_data": null, + "text": "What's the command I just issued?" + } + ], + "role": "user" + }, + "final_response": { + "parts": [ + { + "video_metadata": null, + "thought": null, + "code_execution_result": null, + "executable_code": null, + "file_data": null, + "function_call": null, + "function_response": null, + "inline_data": null, + "text": "You asked me to turn off device_2 in the Bedroom.\n" + } + ], + "role": "model" + }, + "intermediate_data": { + "tool_uses": [], + "intermediate_responses": [] + }, + "creation_timestamp": 1747340849.0429986 + } ], - "reference": "OK. I've turned off device_2 in the Bedroom. 
Anything else?\n" - }, - { - "query": "What's the command I just issued?", - "expected_tool_use": [], - "reference": "You asked me to turn off device_2 in the Bedroom.\n" - } -] + "session_input": null, + "creation_timestamp": 1747340849.0430045 + } + ], + "creation_timestamp": 1747340849.0430162 +} \ No newline at end of file diff --git a/tests/integration/fixture/home_automation_agent/test_files/simple_test2.test.json b/tests/integration/fixture/home_automation_agent/test_files/simple_test2.test.json index 5ba5d82..6efb313 100644 --- a/tests/integration/fixture/home_automation_agent/test_files/simple_test2.test.json +++ b/tests/integration/fixture/home_automation_agent/test_files/simple_test2.test.json @@ -1,5 +1,65 @@ -[{ - "query": "Turn off device_3 in the Bedroom.", - "expected_tool_use": [{"tool_name": "set_device_info", "tool_input": {"location": "Bedroom", "device_id": "device_3", "status": "OFF"}}], - "reference": "I have set the device_3 status to off." -}] +{ + "eval_set_id": "e141f90b-9e7e-4f06-94d7-bbe7e8080ead", + "name": "e141f90b-9e7e-4f06-94d7-bbe7e8080ead", + "description": null, + "eval_cases": [ + { + "eval_id": "tests/integration/fixture/home_automation_agent/test_files/simple_test2.test.json", + "conversation": [ + { + "invocation_id": "c35582f7-838a-460f-b783-039e278165e0", + "user_content": { + "parts": [ + { + "video_metadata": null, + "thought": null, + "code_execution_result": null, + "executable_code": null, + "file_data": null, + "function_call": null, + "function_response": null, + "inline_data": null, + "text": "Turn off device_3 in the Bedroom." + } + ], + "role": "user" + }, + "final_response": { + "parts": [ + { + "video_metadata": null, + "thought": null, + "code_execution_result": null, + "executable_code": null, + "file_data": null, + "function_call": null, + "function_response": null, + "inline_data": null, + "text": "I have set the device_3 status to off." 
+ } + ], + "role": "model" + }, + "intermediate_data": { + "tool_uses": [ + { + "id": null, + "args": { + "location": "Bedroom", + "device_id": "device_3", + "status": "OFF" + }, + "name": "set_device_info" + } + ], + "intermediate_responses": [] + }, + "creation_timestamp": 1747340814.8645504 + } + ], + "session_input": null, + "creation_timestamp": 1747340814.86456 + } + ], + "creation_timestamp": 1747340814.864572 +} \ No newline at end of file diff --git a/tests/integration/fixture/trip_planner_agent/test_files/initial.session.json b/tests/integration/fixture/trip_planner_agent/test_files/initial.session.json deleted file mode 100644 index b33840c..0000000 --- a/tests/integration/fixture/trip_planner_agent/test_files/initial.session.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "id": "test_id", - "app_name": "trip_planner_agent", - "user_id": "test_user", - "state": { - "origin": "San Francisco", - "interests": "Food, Shopping, Museums", - "range": "1000 miles", - "cities": "" - }, - "events": [], - "last_update_time": 1741218714.258285 -} diff --git a/tests/integration/fixture/trip_planner_agent/test_files/trip_inquiry_sub_agent.test.json b/tests/integration/fixture/trip_planner_agent/test_files/trip_inquiry_sub_agent.test.json index 03f52ab..9fe7c6a 100644 --- a/tests/integration/fixture/trip_planner_agent/test_files/trip_inquiry_sub_agent.test.json +++ b/tests/integration/fixture/trip_planner_agent/test_files/trip_inquiry_sub_agent.test.json @@ -1,7 +1,64 @@ -[ - { - "query": "Based on my interests, where should I go, Yosemite national park or Los Angeles?", - "expected_tool_use": [], - "reference": "Given your interests in food, shopping, and museums, Los Angeles would be a better choice than Yosemite National Park. Yosemite is primarily focused on outdoor activities and natural landscapes, while Los Angeles offers a diverse range of culinary experiences, shopping districts, and world-class museums. 
I will now gather information to create an in-depth guide for your trip to Los Angeles.\n" - } -] +{ + "eval_set_id": "189d6856-9b90-4b9c-bda8-7cec899507ae", + "name": "189d6856-9b90-4b9c-bda8-7cec899507ae", + "description": null, + "eval_cases": [ + { + "eval_id": "tests/integration/fixture/trip_planner_agent/test_files/trip_inquiry_sub_agent.test.json", + "conversation": [ + { + "invocation_id": "1c2e8003-d19c-4912-b0ae-17b9d568f8fb", + "user_content": { + "parts": [ + { + "video_metadata": null, + "thought": null, + "code_execution_result": null, + "executable_code": null, + "file_data": null, + "function_call": null, + "function_response": null, + "inline_data": null, + "text": "Based on my interests, where should I go, Yosemite national park or Los Angeles?" + } + ], + "role": "user" + }, + "final_response": { + "parts": [ + { + "video_metadata": null, + "thought": null, + "code_execution_result": null, + "executable_code": null, + "file_data": null, + "function_call": null, + "function_response": null, + "inline_data": null, + "text": "Given your interests in food, shopping, and museums, Los Angeles would be a better choice than Yosemite National Park. Yosemite is primarily focused on outdoor activities and natural landscapes, while Los Angeles offers a diverse range of culinary experiences, shopping districts, and world-class museums. 
I will now gather information to create an in-depth guide for your trip to Los Angeles.\n" + } + ], + "role": "model" + }, + "intermediate_data": { + "tool_uses": [], + "intermediate_responses": [] + }, + "creation_timestamp": 1747339378.484014 + } + ], + "session_input": { + "app_name": "trip_planner_agent", + "user_id": "test_user", + "state": { + "origin": "San Francisco", + "interests": "Food, Shopping, Museums", + "range": "1000 miles", + "cities": "" + } + }, + "creation_timestamp": 1747339378.484044 + } + ], + "creation_timestamp": 1747339378.484056 +} \ No newline at end of file diff --git a/tests/integration/test_evalute_agent_in_fixture.py b/tests/integration/test_evalute_agent_in_fixture.py index 8f9b77f..4fdeed9 100644 --- a/tests/integration/test_evalute_agent_in_fixture.py +++ b/tests/integration/test_evalute_agent_in_fixture.py @@ -32,15 +32,9 @@ def agent_eval_artifacts_in_fixture(): # Evaluation test files end with test.json if not filename.endswith('test.json'): continue - initial_session_file = ( - f'tests/integration/fixture/{agent_name}/initial.session.json' - ) agent_eval_artifacts.append(( f'tests.integration.fixture.{agent_name}', f'tests/integration/fixture/{agent_name}/{filename}', - initial_session_file - if os.path.exists(initial_session_file) - else None, )) # This method gets invoked twice, sorting helps ensure that both the @@ -53,12 +47,12 @@ def agent_eval_artifacts_in_fixture(): @pytest.mark.asyncio @pytest.mark.parametrize( - 'agent_name, evalfile, initial_session_file', + 'agent_name, evalfile', agent_eval_artifacts_in_fixture(), - ids=[agent_name for agent_name, _, _ in agent_eval_artifacts_in_fixture()], + ids=[agent_name for agent_name, _ in agent_eval_artifacts_in_fixture()], ) async def test_evaluate_agents_long_running_4_runs_per_eval_item( - agent_name, evalfile, initial_session_file + agent_name, evalfile ): """Test agents evaluation in fixture folder. 
@@ -70,7 +64,6 @@ async def test_evaluate_agents_long_running_4_runs_per_eval_item( await AgentEvaluator.evaluate( agent_module=agent_name, eval_dataset_file_path_or_dir=evalfile, - initial_session_file=initial_session_file, # Using a slightly higher value helps us manange the variances that may # happen in each eval. # This, of course, comes at a cost of incrased test run times. diff --git a/tests/integration/test_multi_agent.py b/tests/integration/test_multi_agent.py index 2d0ccab..3d161a9 100644 --- a/tests/integration/test_multi_agent.py +++ b/tests/integration/test_multi_agent.py @@ -18,13 +18,10 @@ import pytest @pytest.mark.asyncio async def test_eval_agent(): - AgentEvaluator.evaluate( + await AgentEvaluator.evaluate( agent_module="tests.integration.fixture.trip_planner_agent", eval_dataset_file_path_or_dir=( "tests/integration/fixture/trip_planner_agent/trip_inquiry.test.json" ), - initial_session_file=( - "tests/integration/fixture/trip_planner_agent/initial.session.json" - ), num_runs=4, ) diff --git a/tests/integration/test_multi_turn.py b/tests/integration/test_multi_turn.py index 82ca746..5e300a7 100644 --- a/tests/integration/test_multi_turn.py +++ b/tests/integration/test_multi_turn.py @@ -19,7 +19,7 @@ import pytest @pytest.mark.asyncio async def test_simple_multi_turn_conversation(): """Test a simple multi-turn conversation.""" - AgentEvaluator.evaluate( + await AgentEvaluator.evaluate( agent_module="tests.integration.fixture.home_automation_agent", eval_dataset_file_path_or_dir="tests/integration/fixture/home_automation_agent/test_files/simple_multi_turn_conversation.test.json", num_runs=4, @@ -29,7 +29,7 @@ async def test_simple_multi_turn_conversation(): @pytest.mark.asyncio async def test_dependent_tool_calls(): """Test subsequent tool calls that are dependent on previous tool calls.""" - AgentEvaluator.evaluate( + await AgentEvaluator.evaluate( agent_module="tests.integration.fixture.home_automation_agent", 
eval_dataset_file_path_or_dir="tests/integration/fixture/home_automation_agent/test_files/dependent_tool_calls.test.json", num_runs=4, @@ -39,8 +39,7 @@ async def test_dependent_tool_calls(): @pytest.mark.asyncio async def test_memorizing_past_events(): """Test memorizing past events.""" - - AgentEvaluator.evaluate( + await AgentEvaluator.evaluate( agent_module="tests.integration.fixture.home_automation_agent", eval_dataset_file_path_or_dir="tests/integration/fixture/home_automation_agent/test_files/memorizing_past_events/eval_data.test.json", num_runs=4, diff --git a/tests/integration/test_sub_agent.py b/tests/integration/test_sub_agent.py index b3d89b3..cbfb90b 100644 --- a/tests/integration/test_sub_agent.py +++ b/tests/integration/test_sub_agent.py @@ -19,10 +19,9 @@ import pytest @pytest.mark.asyncio async def test_eval_agent(): """Test hotel sub agent in a multi-agent system.""" - AgentEvaluator.evaluate( + await AgentEvaluator.evaluate( agent_module="tests.integration.fixture.trip_planner_agent", eval_dataset_file_path_or_dir="tests/integration/fixture/trip_planner_agent/test_files/trip_inquiry_sub_agent.test.json", - initial_session_file="tests/integration/fixture/trip_planner_agent/test_files/initial.session.json", agent_name="identify_agent", num_runs=4, ) diff --git a/tests/integration/test_with_test_file.py b/tests/integration/test_with_test_file.py index 664bdbf..d19428f 100644 --- a/tests/integration/test_with_test_file.py +++ b/tests/integration/test_with_test_file.py @@ -19,7 +19,7 @@ import pytest @pytest.mark.asyncio async def test_with_single_test_file(): """Test the agent's basic ability via session file.""" - AgentEvaluator.evaluate( + await AgentEvaluator.evaluate( agent_module="tests.integration.fixture.home_automation_agent", eval_dataset_file_path_or_dir="tests/integration/fixture/home_automation_agent/simple_test.test.json", ) @@ -28,7 +28,7 @@ async def test_with_single_test_file(): @pytest.mark.asyncio async def 
test_with_folder_of_test_files_long_running(): """Test the agent's basic ability via a folder of session files.""" - AgentEvaluator.evaluate( + await AgentEvaluator.evaluate( agent_module="tests.integration.fixture.home_automation_agent", eval_dataset_file_path_or_dir=( "tests/integration/fixture/home_automation_agent/test_files"