Updated test cases to use the new EvalSet schema to store test data. Also, added a utility to help migrate existing test files to the new schema.

Also, migrated existing test files to the new schema and deleted test session files as they are no longer needed. PiperOrigin-RevId: 759318735
2025-07-14 01:41:25 -06:00 · 2025-05-15 15:09:30 -07:00 · 2025-05-15 15:09:30 -07:00 · 1c23556225
commit 1c23556225
parent a71d9ea9a1
17 changed files with 1110 additions and 231 deletions
--- a/src/google/adk/evaluation/agent_evaluator.py
+++ b/src/google/adk/evaluation/agent_evaluator.py
@ -13,12 +13,18 @@
 # limitations under the License.

 import json
+import logging
 import os
 from os import path
+from typing import Any
 from typing import Dict
 from typing import List
+from typing import Optional
 from typing import Union
 import uuid
+
+from pydantic import ValidationError
+
 from .eval_set import EvalSet
 from .evaluation_generator import EvaluationGenerator
 from .evaluator import EvalStatus
@ -28,6 +34,9 @@ from .local_eval_sets_manager import convert_eval_set_to_pydanctic_schema
 from .response_evaluator import ResponseEvaluator
 from .trajectory_evaluator import TrajectoryEvaluator

+logger = logging.getLogger(__name__)
+
+
 # Constants for default runs and evaluation criteria
 NUM_RUNS = 2
 TOOL_TRAJECTORY_SCORE_KEY = "tool_trajectory_avg_score"
@ -131,18 +140,17 @@ class AgentEvaluator:
        )

        assert evaluation_result.overall_eval_status == EvalStatus.PASSED, (
-            f"`{eval_case_responses.eval_case.eval_id}`: "
            f"{metric_name} for {agent_module} Failed. Expected {threshold},"
            f" but got {evaluation_result.overall_score}."
        )

  @staticmethod
  async def evaluate(
-      agent_module,
-      eval_dataset_file_path_or_dir,
-      num_runs=NUM_RUNS,
-      agent_name=None,
-      initial_session_file=None,
+      agent_module: str,
+      eval_dataset_file_path_or_dir: str,
+      num_runs: int = NUM_RUNS,
+      agent_name: Optional[str] = None,
+      initial_session_file: Optional[str] = None,
  ):
    """Evaluates an Agent given eval data.

@ -170,25 +178,14 @@ class AgentEvaluator:
    else:
      test_files = [eval_dataset_file_path_or_dir]

-    initial_session = {}
-    if initial_session_file:
-      with open(initial_session_file, "r") as f:
-        initial_session = json.loads(f.read())
+    initial_session = AgentEvaluator._get_initial_session(initial_session_file)

    for test_file in test_files:
-      data = AgentEvaluator._load_dataset(test_file)[0]
      criteria = AgentEvaluator.find_config_for_test_file(test_file)
-      AgentEvaluator._validate_input([data], criteria)
-
-      eval_data = {
-          "name": test_file,
-          "data": data,
-          "initial_session": initial_session,
-      }
-
-      eval_set = convert_eval_set_to_pydanctic_schema(
-          eval_set_id=str(uuid.uuid4()), eval_set_in_json_format=[eval_data]
+      eval_set = AgentEvaluator._load_eval_set_from_file(
+          test_file, criteria, initial_session
      )
+
      await AgentEvaluator.evaluate_eval_set(
          agent_module=agent_module,
          eval_set=eval_set,
@ -197,6 +194,86 @@ class AgentEvaluator:
          agent_name=agent_name,
      )

+  @staticmethod
+  def migrate_eval_data_to_new_schema(
+      old_eval_data_file: str,
+      new_eval_data_file: str,
+      initial_session_file: Optional[str] = None,
+  ):
+    """A utility for migrating eval data to new schema backed by EvalSet."""
+    if not old_eval_data_file or not new_eval_data_file:
+      raise ValueError(
+          "One of old_eval_data_file or new_eval_data_file is empty."
+      )
+
+    criteria = AgentEvaluator.find_config_for_test_file(old_eval_data_file)
+    initial_session = AgentEvaluator._get_initial_session(initial_session_file)
+
+    eval_set = AgentEvaluator._get_eval_set_from_old_format(
+        old_eval_data_file, criteria, initial_session
+    )
+
+    with open(new_eval_data_file, "w") as f:
+      f.write(eval_set.model_dump_json(indent=2))
+
+  @staticmethod
+  def _load_eval_set_from_file(
+      eval_set_file: str,
+      criteria: dict[str, float],
+      initial_session: dict[str, Any],
+  ) -> EvalSet:
+    """Loads an EvalSet from the given file."""
+    if os.path.isfile(eval_set_file):
+      with open(eval_set_file, "r", encoding="utf-8") as f:
+        content = f.read()
+
+      try:
+        eval_set = EvalSet.model_validate_json(content)
+        assert len(initial_session) == 0, (
+            "Intial session should be specified as a part of EvalSet file."
+            " Explicit initial session is only needed, when specifying data in"
+            " the older schema."
+        )
+        return eval_set
+      except ValidationError:
+        # We assume that the eval data was specified in the old format
+        logger.warning(
+            f"Contents of {eval_set_file} appear to be in older format.To avoid"
+            " this warning, please update your test files to contain data in"
+            " EvalSet schema. You can use `migrate_eval_data_to_new_schema`"
+            " for migrating your old test files."
+        )
+
+    # If we are here, the data must be specified in the older format.
+    return AgentEvaluator._get_eval_set_from_old_format(
+        eval_set_file, criteria, initial_session
+    )
+
+  @staticmethod
+  def _get_eval_set_from_old_format(
+      eval_set_file: str,
+      criteria: dict[str, float],
+      initial_session: dict[str, Any],
+  ) -> EvalSet:
+    data = AgentEvaluator._load_dataset(eval_set_file)[0]
+    AgentEvaluator._validate_input([data], criteria)
+    eval_data = {
+        "name": eval_set_file,
+        "data": data,
+        "initial_session": initial_session,
+    }
+    return convert_eval_set_to_pydanctic_schema(
+        eval_set_id=str(uuid.uuid4()), eval_set_in_json_format=[eval_data]
+    )
+
+  @staticmethod
+  def _get_initial_session(initial_session_file: Optional[str] = None):
+    initial_session = {}
+    if initial_session_file:
+      with open(initial_session_file, "r") as f:
+        initial_session = json.loads(f.read())
+    return initial_session
+
  @staticmethod
  def _load_dataset(
      input_data: Union[str, List[str], List[Dict], List[List[Dict]]],
--- a/src/google/adk/evaluation/local_eval_sets_manager.py
+++ b/src/google/adk/evaluation/local_eval_sets_manager.py
@ -135,7 +135,10 @@ def convert_eval_set_to_pydanctic_schema(
      )

    session_input = None
-    if "initial_session" in old_eval_case:
+    if (
+        "initial_session" in old_eval_case
+        and len(old_eval_case["initial_session"]) > 0
+    ):
      session_input = SessionInput(
          app_name=old_eval_case["initial_session"].get("app_name", ""),
          user_id=old_eval_case["initial_session"].get("user_id", ""),
--- a/tests/integration/fixture/ecommerce_customer_service_agent/order_query.test.json
+++ b/tests/integration/fixture/ecommerce_customer_service_agent/order_query.test.json
@ -1,69 +1,229 @@
-[
+{
+  "eval_set_id": "a1157c01-851f-48a8-b956-83cf7f463510",
+  "name": "a1157c01-851f-48a8-b956-83cf7f463510",
+  "description": null,
+  "eval_cases": [
    {
-    "query": "Send an email to user user_a whose email address is alice@example.com",
-    "expected_tool_use": [
+      "eval_id": "tests/integration/fixture/ecommerce_customer_service_agent/order_query.test.json",
+      "conversation": [
        {
-        "tool_name": "send_email",
-        "tool_input": {
+          "invocation_id": "38d54523-d789-4873-8cc0-d38826c7feb4",
+          "user_content": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "Send an email to user user_a whose email address is alice@example.com"
+              }
+            ],
+            "role": "user"
+          },
+          "final_response": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "Email sent to alice@example.com for user id user_a."
+              }
+            ],
+            "role": "model"
+          },
+          "intermediate_data": {
+            "tool_uses": [
+              {
+                "id": null,
+                "args": {
                  "email": "alice@example.com",
                  "user_id": "user_a"
-        }
+                },
+                "name": "send_email"
              }
            ],
-    "reference": "Email sent to alice@example.com for user id user_a."
+            "intermediate_responses": []
+          },
+          "creation_timestamp": 1747341706.6240807
        },
        {
-    "query": "Can you tell me the status of my order with ID 1?",
-    "expected_tool_use": [
+          "invocation_id": "916393ab-0bce-4cb0-98de-6573d4e8e25c",
+          "user_content": {
+            "parts": [
              {
-        "tool_name": "get_order_status",
-        "tool_input": {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "Can you tell me the status of my order with ID 1?"
+              }
+            ],
+            "role": "user"
+          },
+          "final_response": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "Your order with ID 1 is FINISHED."
+              }
+            ],
+            "role": "model"
+          },
+          "intermediate_data": {
+            "tool_uses": [
+              {
+                "id": null,
+                "args": {
                  "order_id": "1"
-        }
+                },
+                "name": "get_order_status"
              }
            ],
-    "reference": "Your order with ID 1 is FINISHED."
+            "intermediate_responses": []
+          },
+          "creation_timestamp": 1747341706.6241167
        },
        {
-    "query": "Cancel all pending order for the user with user id user_a",
-    "expected_tool_use": [
+          "invocation_id": "511b23d9-56f9-423b-9c31-7626f3411c32",
+          "user_content": {
+            "parts": [
              {
-        "tool_name": "get_order_ids_for_user",
-        "tool_input": {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "Cancel all pending order for the user with user id user_a"
+              }
+            ],
+            "role": "user"
+          },
+          "final_response": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "I have checked your orders and order 4 was in pending status, so I have cancelled it. Order 1 was already finished and couldn't be cancelled.\n"
+              }
+            ],
+            "role": "model"
+          },
+          "intermediate_data": {
+            "tool_uses": [
+              {
+                "id": null,
+                "args": {
                  "user_id": "user_a"
-        }
+                },
+                "name": "get_order_ids_for_user"
              },
              {
-        "tool_name": "get_order_status",
-        "tool_input": {
+                "id": null,
+                "args": {
                  "order_id": "1"
-        }
+                },
+                "name": "get_order_status"
              },
              {
-        "tool_name": "get_order_status",
-        "tool_input": {
+                "id": null,
+                "args": {
                  "order_id": "4"
-        }
+                },
+                "name": "get_order_status"
              },
              {
-        "tool_name": "cancel_order",
-        "tool_input": {
+                "id": null,
+                "args": {
                  "order_id": "4"
-        }
+                },
+                "name": "cancel_order"
              }
            ],
-    "reference": "I have checked your orders and order 4 was in pending status, so I have cancelled it. Order 1 was already finished and couldn't be cancelled.\n"
+            "intermediate_responses": []
+          },
+          "creation_timestamp": 1747341706.6241703
        },
        {
-    "query": "What orders have I placed under the username user_b?",
-    "expected_tool_use": [
+          "invocation_id": "dcdf4b6d-96dd-4602-8c14-0563c6f6b5d0",
+          "user_content": {
+            "parts": [
              {
-        "tool_name": "get_order_ids_for_user",
-        "tool_input": {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "What orders have I placed under the username user_b?"
+              }
+            ],
+            "role": "user"
+          },
+          "final_response": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "User user_b has placed one order with order ID 2.\n"
+              }
+            ],
+            "role": "model"
+          },
+          "intermediate_data": {
+            "tool_uses": [
+              {
+                "id": null,
+                "args": {
                  "user_id": "user_b"
-        }
+                },
+                "name": "get_order_ids_for_user"
              }
            ],
-    "reference": "User user_b has placed one order with order ID 2.\n"
+            "intermediate_responses": []
+          },
+          "creation_timestamp": 1747341706.624196
        }
-]
+      ],
+      "session_input": null,
+      "creation_timestamp": 1747341706.6242023
+    }
+  ],
+  "creation_timestamp": 1747341706.6242158
+}
--- a/tests/integration/fixture/hello_world_agent/roll_die.test.json
+++ b/tests/integration/fixture/hello_world_agent/roll_die.test.json
@ -1,24 +1,143 @@
-[
+{
+  "eval_set_id": "56540925-a5ff-49fe-a4e1-589fe78066f2",
+  "name": "56540925-a5ff-49fe-a4e1-589fe78066f2",
+  "description": null,
+  "eval_cases": [
    {
-    "query": "Hi who are you?",
-    "expected_tool_use": [],
-    "reference": "I am a data processing agent. I can roll dice and check if the results are prime numbers. What would you like me to do? \n"
-  },
+      "eval_id": "tests/integration/fixture/hello_world_agent/roll_die.test.json",
+      "conversation": [
        {
-    "query": "What can you do?",
-    "expected_tool_use": [],
-    "reference": "I can roll dice for you of different sizes, and I can check if the results are prime numbers.  I can also remember previous rolls if you'd like to check those for primes as well.  What would you like me to do? \n"
-  },
+          "invocation_id": "b01f67f0-9f23-44d6-bbe4-36ea235cb9fb",
+          "user_content": {
+            "parts": [
              {
-    "query": "Can you roll a die with 6 sides",
-    "expected_tool_use": [
-      {
-        "tool_name": "roll_die",
-        "tool_input": {
-          "sides": 6
-        }
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "Hi who are you?"
              }
            ],
-    "reference": null
+            "role": "user"
+          },
+          "final_response": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "I am a data processing agent. I can roll dice and check if the results are prime numbers. What would you like me to do? \n"
              }
-]
+            ],
+            "role": "model"
+          },
+          "intermediate_data": {
+            "tool_uses": [],
+            "intermediate_responses": []
+          },
+          "creation_timestamp": 1747341775.8937013
+        },
+        {
+          "invocation_id": "13be0093-ac29-4828-98c6-5bbd570c010c",
+          "user_content": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "What can you do?"
+              }
+            ],
+            "role": "user"
+          },
+          "final_response": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "I can roll dice for you of different sizes, and I can check if the results are prime numbers.  I can also remember previous rolls if you'd like to check those for primes as well.  What would you like me to do? \n"
+              }
+            ],
+            "role": "model"
+          },
+          "intermediate_data": {
+            "tool_uses": [],
+            "intermediate_responses": []
+          },
+          "creation_timestamp": 1747341775.8937378
+        },
+        {
+          "invocation_id": "7deda353-c936-4c21-b242-9fa75e45b6a7",
+          "user_content": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "Can you roll a die with 6 sides"
+              }
+            ],
+            "role": "user"
+          },
+          "final_response": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": null
+              }
+            ],
+            "role": "model"
+          },
+          "intermediate_data": {
+            "tool_uses": [
+              {
+                "id": null,
+                "args": {
+                  "sides": 6
+                },
+                "name": "roll_die"
+              }
+            ],
+            "intermediate_responses": []
+          },
+          "creation_timestamp": 1747341775.8937788
+        }
+      ],
+      "session_input": null,
+      "creation_timestamp": 1747341775.8937826
+    }
+  ],
+  "creation_timestamp": 1747341775.8937957
+}
--- a/tests/integration/fixture/home_automation_agent/simple_test.test.json
+++ b/tests/integration/fixture/home_automation_agent/simple_test.test.json
@ -1,5 +1,65 @@
-[{
-  "query": "Turn off device_2 in the Bedroom.",
-  "expected_tool_use": [{"tool_name": "set_device_info", "tool_input": {"location": "Bedroom", "device_id": "device_2", "status": "OFF"}}],
-  "reference": "I have set the device_2 status to off."
-}]
+{
+  "eval_set_id": "b305bd06-38c5-4796-b9c7-d9c7454338b9",
+  "name": "b305bd06-38c5-4796-b9c7-d9c7454338b9",
+  "description": null,
+  "eval_cases": [
+    {
+      "eval_id": "tests/integration/fixture/home_automation_agent/simple_test.test.json",
+      "conversation": [
+        {
+          "invocation_id": "b7982664-0ab6-47cc-ab13-326656afdf75",
+          "user_content": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "Turn off device_2 in the Bedroom."
+              }
+            ],
+            "role": "user"
+          },
+          "final_response": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "I have set the device_2 status to off."
+              }
+            ],
+            "role": "model"
+          },
+          "intermediate_data": {
+            "tool_uses": [
+              {
+                "id": null,
+                "args": {
+                  "location": "Bedroom",
+                  "device_id": "device_2",
+                  "status": "OFF"
+                },
+                "name": "set_device_info"
+              }
+            ],
+            "intermediate_responses": []
+          },
+          "creation_timestamp": 1747337309.2360144
+        }
+      ],
+      "session_input": null,
+      "creation_timestamp": 1747337309.2360282
+    }
+  ],
+  "creation_timestamp": 1747337309.2360387
+}
--- a/tests/integration/fixture/home_automation_agent/test_files/dependent_tool_calls.test.json
+++ b/tests/integration/fixture/home_automation_agent/test_files/dependent_tool_calls.test.json
@ -1,18 +1,113 @@
-[
+{
+  "eval_set_id": "1be50511-ff75-4d68-b2d7-2165cbdc1044",
+  "name": "1be50511-ff75-4d68-b2d7-2165cbdc1044",
+  "description": null,
+  "eval_cases": [
    {
-      "query": "Turn off device_2 in the Bedroom.",
-      "expected_tool_use": [{
-          "tool_name": "set_device_info",
-          "tool_input": {"location": "Bedroom", "status": "OFF", "device_id": "device_2"}
-      }],
-      "reference": "I have set the device 2 status to off."
+      "eval_id": "tests/integration/fixture/home_automation_agent/test_files/dependent_tool_calls.test.json",
+      "conversation": [
+        {
+          "invocation_id": "cbece1c0-3811-45c0-96fc-9a4279075483",
+          "user_content": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "Turn off device_2 in the Bedroom."
+              }
+            ],
+            "role": "user"
+          },
+          "final_response": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "I have set the device 2 status to off."
+              }
+            ],
+            "role": "model"
+          },
+          "intermediate_data": {
+            "tool_uses": [
+              {
+                "id": null,
+                "args": {
+                  "location": "Bedroom",
+                  "status": "OFF",
+                  "device_id": "device_2"
+                },
+                "name": "set_device_info"
+              }
+            ],
+            "intermediate_responses": []
+          },
+          "creation_timestamp": 1747340826.1082227
        },
        {
-      "query": "What's the status of device_2 in the Bedroom?",
-      "expected_tool_use": [{
-          "tool_name": "get_device_info",
-          "tool_input": {"device_id": "device_2"}
-      }],
-      "reference": "Status of device_2 is off."
+          "invocation_id": "cc85cdae-4258-4b94-8fe7-a985b8356190",
+          "user_content": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "What's the status of device_2 in the Bedroom?"
              }
-]
+            ],
+            "role": "user"
+          },
+          "final_response": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "Status of device_2 is off."
+              }
+            ],
+            "role": "model"
+          },
+          "intermediate_data": {
+            "tool_uses": [
+              {
+                "id": null,
+                "args": {
+                  "device_id": "device_2"
+                },
+                "name": "get_device_info"
+              }
+            ],
+            "intermediate_responses": []
+          },
+          "creation_timestamp": 1747340826.1082554
+        }
+      ],
+      "session_input": null,
+      "creation_timestamp": 1747340826.108262
+    }
+  ],
+  "creation_timestamp": 1747340826.108275
+}
--- a/tests/integration/fixture/home_automation_agent/test_files/memorizing_past_events/eval_data.test.json
+++ b/tests/integration/fixture/home_automation_agent/test_files/memorizing_past_events/eval_data.test.json
@ -1,17 +1,105 @@
-[
+{
+  "eval_set_id": "94553685-5f19-492b-bc44-f3bc775955e9",
+  "name": "94553685-5f19-492b-bc44-f3bc775955e9",
+  "description": null,
+  "eval_cases": [
    {
-      "query": "Turn off device_2 in the Bedroom.",
-      "expected_tool_use": [
+      "eval_id": "tests/integration/fixture/home_automation_agent/test_files/memorizing_past_events/eval_data.test.json",
+      "conversation": [
        {
-              "tool_name": "set_device_info",
-              "tool_input": {"location": "Bedroom", "device_id": "device_2", "status": "OFF"}
+          "invocation_id": "a958b622-21d3-4a6c-9c15-1274bbb8a6b6",
+          "user_content": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "Turn off device_2 in the Bedroom."
              }
            ],
-      "reference": "OK. I've turned off device_2 in the Bedroom. Anything else?\n"
+            "role": "user"
+          },
+          "final_response": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "OK. I've turned off device_2 in the Bedroom. Anything else?\n"
+              }
+            ],
+            "role": "model"
+          },
+          "intermediate_data": {
+            "tool_uses": [
+              {
+                "id": null,
+                "args": {
+                  "location": "Bedroom",
+                  "device_id": "device_2",
+                  "status": "OFF"
+                },
+                "name": "set_device_info"
+              }
+            ],
+            "intermediate_responses": []
+          },
+          "creation_timestamp": 1747340865.7043095
        },
        {
-      "query": "What's the command I just issued?",
-      "expected_tool_use": [],
-      "reference": "You asked me to turn off device_2 in the Bedroom.\n"
+          "invocation_id": "1c07123d-4bed-4eb0-9e55-c7f80c70dadf",
+          "user_content": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "What's the command I just issued?"
              }
-]
+            ],
+            "role": "user"
+          },
+          "final_response": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "You asked me to turn off device_2 in the Bedroom.\n"
+              }
+            ],
+            "role": "model"
+          },
+          "intermediate_data": {
+            "tool_uses": [],
+            "intermediate_responses": []
+          },
+          "creation_timestamp": 1747340865.7043421
+        }
+      ],
+      "session_input": null,
+      "creation_timestamp": 1747340865.7043483
+    }
+  ],
+  "creation_timestamp": 1747340865.704361
+}
--- a/tests/integration/fixture/home_automation_agent/test_files/simple_multi_turn_conversation.test.json
+++ b/tests/integration/fixture/home_automation_agent/test_files/simple_multi_turn_conversation.test.json
@ -1,18 +1,115 @@
-[
+{
+  "eval_set_id": "4412cca6-dfcd-43ab-bbc5-9155380c7137",
+  "name": "4412cca6-dfcd-43ab-bbc5-9155380c7137",
+  "description": null,
+  "eval_cases": [
    {
-        "query": "Turn off device_2 in the Bedroom.",
-        "expected_tool_use": [{
-            "tool_name": "set_device_info",
-            "tool_input": {"location": "Bedroom", "device_id": "device_2", "status": "OFF"}
-        }],
-        "reference": "I have set the device 2 status to off."
+      "eval_id": "tests/integration/fixture/home_automation_agent/test_files/simple_multi_turn_conversation.test.json",
+      "conversation": [
+        {
+          "invocation_id": "9f51a1ac-56a4-4b4a-9878-36ff1ae312ce",
+          "user_content": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "Turn off device_2 in the Bedroom."
+              }
+            ],
+            "role": "user"
+          },
+          "final_response": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "I have set the device 2 status to off."
+              }
+            ],
+            "role": "model"
+          },
+          "intermediate_data": {
+            "tool_uses": [
+              {
+                "id": null,
+                "args": {
+                  "location": "Bedroom",
+                  "device_id": "device_2",
+                  "status": "OFF"
+                },
+                "name": "set_device_info"
+              }
+            ],
+            "intermediate_responses": []
+          },
+          "creation_timestamp": 1747340791.7353904
        },
        {
-        "query": "Turn on device_2 in the Bedroom.",
-        "expected_tool_use": [{
-            "tool_name": "set_device_info",
-            "tool_input": {"location": "Bedroom", "status": "ON", "device_id": "device_2"}
-        }],
-        "reference": "I have set the device 2 status to on."
+          "invocation_id": "c82d54d0-5fa8-4f79-a6dc-692090f0d42b",
+          "user_content": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "Turn on device_2 in the Bedroom."
              }
-]
+            ],
+            "role": "user"
+          },
+          "final_response": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "I have set the device 2 status to on."
+              }
+            ],
+            "role": "model"
+          },
+          "intermediate_data": {
+            "tool_uses": [
+              {
+                "id": null,
+                "args": {
+                  "location": "Bedroom",
+                  "status": "ON",
+                  "device_id": "device_2"
+                },
+                "name": "set_device_info"
+              }
+            ],
+            "intermediate_responses": []
+          },
+          "creation_timestamp": 1747340791.7354295
+        }
+      ],
+      "session_input": null,
+      "creation_timestamp": 1747340791.7354348
+    }
+  ],
+  "creation_timestamp": 1747340791.735446
+}
--- a/tests/integration/fixture/home_automation_agent/test_files/simple_test.test.json
+++ b/tests/integration/fixture/home_automation_agent/test_files/simple_test.test.json
@ -1,17 +1,105 @@
-[
+{
+  "eval_set_id": "9100bfc9-cc28-4ab9-b920-2dc72e138997",
+  "name": "9100bfc9-cc28-4ab9-b920-2dc72e138997",
+  "description": null,
+  "eval_cases": [
    {
-      "query": "Turn off device_2 in the Bedroom.",
-      "expected_tool_use": [
+      "eval_id": "tests/integration/fixture/home_automation_agent/test_files/simple_test.test.json",
+      "conversation": [
        {
-              "tool_name": "set_device_info",
-              "tool_input": {"location": "Bedroom", "device_id": "device_2", "status": "OFF"}
+          "invocation_id": "9f5e8d91-8e51-41d6-addf-196a828168c5",
+          "user_content": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "Turn off device_2 in the Bedroom."
              }
            ],
-      "reference": "OK. I've turned off device_2 in the Bedroom. Anything else?\n"
+            "role": "user"
+          },
+          "final_response": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "OK. I've turned off device_2 in the Bedroom. Anything else?\n"
+              }
+            ],
+            "role": "model"
+          },
+          "intermediate_data": {
+            "tool_uses": [
+              {
+                "id": null,
+                "args": {
+                  "location": "Bedroom",
+                  "device_id": "device_2",
+                  "status": "OFF"
+                },
+                "name": "set_device_info"
+              }
+            ],
+            "intermediate_responses": []
+          },
+          "creation_timestamp": 1747340849.0429707
        },
        {
-      "query": "What's the command I just issued?",
-      "expected_tool_use": [],
-      "reference": "You asked me to turn off device_2 in the Bedroom.\n"
+          "invocation_id": "767b2451-5f7b-4c73-aeaf-a82c71e15788",
+          "user_content": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "What's the command I just issued?"
              }
-]
+            ],
+            "role": "user"
+          },
+          "final_response": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "You asked me to turn off device_2 in the Bedroom.\n"
+              }
+            ],
+            "role": "model"
+          },
+          "intermediate_data": {
+            "tool_uses": [],
+            "intermediate_responses": []
+          },
+          "creation_timestamp": 1747340849.0429986
+        }
+      ],
+      "session_input": null,
+      "creation_timestamp": 1747340849.0430045
+    }
+  ],
+  "creation_timestamp": 1747340849.0430162
+}
--- a/tests/integration/fixture/home_automation_agent/test_files/simple_test2.test.json
+++ b/tests/integration/fixture/home_automation_agent/test_files/simple_test2.test.json
@ -1,5 +1,65 @@
-[{
-  "query": "Turn off device_3 in the Bedroom.",
-  "expected_tool_use": [{"tool_name": "set_device_info", "tool_input": {"location": "Bedroom", "device_id": "device_3", "status": "OFF"}}],
-  "reference": "I have set the device_3 status to off."
-}]
+{
+  "eval_set_id": "e141f90b-9e7e-4f06-94d7-bbe7e8080ead",
+  "name": "e141f90b-9e7e-4f06-94d7-bbe7e8080ead",
+  "description": null,
+  "eval_cases": [
+    {
+      "eval_id": "tests/integration/fixture/home_automation_agent/test_files/simple_test2.test.json",
+      "conversation": [
+        {
+          "invocation_id": "c35582f7-838a-460f-b783-039e278165e0",
+          "user_content": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "Turn off device_3 in the Bedroom."
+              }
+            ],
+            "role": "user"
+          },
+          "final_response": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "I have set the device_3 status to off."
+              }
+            ],
+            "role": "model"
+          },
+          "intermediate_data": {
+            "tool_uses": [
+              {
+                "id": null,
+                "args": {
+                  "location": "Bedroom",
+                  "device_id": "device_3",
+                  "status": "OFF"
+                },
+                "name": "set_device_info"
+              }
+            ],
+            "intermediate_responses": []
+          },
+          "creation_timestamp": 1747340814.8645504
+        }
+      ],
+      "session_input": null,
+      "creation_timestamp": 1747340814.86456
+    }
+  ],
+  "creation_timestamp": 1747340814.864572
+}
--- a/tests/integration/fixture/trip_planner_agent/test_files/initial.session.json
+++ b/tests/integration/fixture/trip_planner_agent/test_files/initial.session.json
@ -1,13 +0,0 @@
-{
-  "id": "test_id",
-  "app_name": "trip_planner_agent",
-  "user_id": "test_user",
-  "state": {
-    "origin": "San Francisco",
-    "interests": "Food, Shopping, Museums",
-    "range": "1000 miles",
-    "cities": ""
-  },
-  "events": [],
-  "last_update_time": 1741218714.258285
-}
--- a/tests/integration/fixture/trip_planner_agent/test_files/trip_inquiry_sub_agent.test.json
+++ b/tests/integration/fixture/trip_planner_agent/test_files/trip_inquiry_sub_agent.test.json
@ -1,7 +1,64 @@
-[
+{
+  "eval_set_id": "189d6856-9b90-4b9c-bda8-7cec899507ae",
+  "name": "189d6856-9b90-4b9c-bda8-7cec899507ae",
+  "description": null,
+  "eval_cases": [
    {
-    "query": "Based on my interests, where should I go, Yosemite national park or Los Angeles?",
-    "expected_tool_use": [],
-    "reference": "Given your interests in food, shopping, and museums, Los Angeles would be a better choice than Yosemite National Park. Yosemite is primarily focused on outdoor activities and natural landscapes, while Los Angeles offers a diverse range of culinary experiences, shopping districts, and world-class museums. I will now gather information to create an in-depth guide for your trip to Los Angeles.\n"
+      "eval_id": "tests/integration/fixture/trip_planner_agent/test_files/trip_inquiry_sub_agent.test.json",
+      "conversation": [
+        {
+          "invocation_id": "1c2e8003-d19c-4912-b0ae-17b9d568f8fb",
+          "user_content": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "Based on my interests, where should I go, Yosemite national park or Los Angeles?"
              }
-]
+            ],
+            "role": "user"
+          },
+          "final_response": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "Given your interests in food, shopping, and museums, Los Angeles would be a better choice than Yosemite National Park. Yosemite is primarily focused on outdoor activities and natural landscapes, while Los Angeles offers a diverse range of culinary experiences, shopping districts, and world-class museums. I will now gather information to create an in-depth guide for your trip to Los Angeles.\n"
+              }
+            ],
+            "role": "model"
+          },
+          "intermediate_data": {
+            "tool_uses": [],
+            "intermediate_responses": []
+          },
+          "creation_timestamp": 1747339378.484014
+        }
+      ],
+      "session_input": {
+        "app_name": "trip_planner_agent",
+        "user_id": "test_user",
+        "state": {
+          "origin": "San Francisco",
+          "interests": "Food, Shopping, Museums",
+          "range": "1000 miles",
+          "cities": ""
+        }
+      },
+      "creation_timestamp": 1747339378.484044
+    }
+  ],
+  "creation_timestamp": 1747339378.484056
+}
--- a/tests/integration/test_evalute_agent_in_fixture.py
+++ b/tests/integration/test_evalute_agent_in_fixture.py
@ -32,15 +32,9 @@ def agent_eval_artifacts_in_fixture():
      # Evaluation test files end with test.json
      if not filename.endswith('test.json'):
        continue
-      initial_session_file = (
-          f'tests/integration/fixture/{agent_name}/initial.session.json'
-      )
      agent_eval_artifacts.append((
          f'tests.integration.fixture.{agent_name}',
          f'tests/integration/fixture/{agent_name}/{filename}',
-          initial_session_file
-          if os.path.exists(initial_session_file)
-          else None,
      ))

  # This method gets invoked twice, sorting helps ensure that both the
@ -53,12 +47,12 @@ def agent_eval_artifacts_in_fixture():

@pytest.mark.asyncio
@pytest.mark.parametrize(
-    'agent_name, evalfile, initial_session_file',
+    'agent_name, evalfile',
    agent_eval_artifacts_in_fixture(),
-    ids=[agent_name for agent_name, _, _ in agent_eval_artifacts_in_fixture()],
+    ids=[agent_name for agent_name, _ in agent_eval_artifacts_in_fixture()],
 )
 async def test_evaluate_agents_long_running_4_runs_per_eval_item(
-    agent_name, evalfile, initial_session_file
+    agent_name, evalfile
 ):
  """Test agents evaluation in fixture folder.

@ -70,7 +64,6 @@ async def test_evaluate_agents_long_running_4_runs_per_eval_item(
  await AgentEvaluator.evaluate(
      agent_module=agent_name,
      eval_dataset_file_path_or_dir=evalfile,
-      initial_session_file=initial_session_file,
      # Using a slightly higher value helps us manage the variances that may
      # happen in each eval.
      # This, of course, comes at a cost of increased test run times.
--- a/tests/integration/test_multi_agent.py
+++ b/tests/integration/test_multi_agent.py
@ -18,13 +18,10 @@ import pytest

@pytest.mark.asyncio
 async def test_eval_agent():
-  AgentEvaluator.evaluate(
+  await AgentEvaluator.evaluate(
      agent_module="tests.integration.fixture.trip_planner_agent",
      eval_dataset_file_path_or_dir=(
          "tests/integration/fixture/trip_planner_agent/trip_inquiry.test.json"
      ),
-      initial_session_file=(
-          "tests/integration/fixture/trip_planner_agent/initial.session.json"
-      ),
      num_runs=4,
  )
--- a/tests/integration/test_multi_turn.py
+++ b/tests/integration/test_multi_turn.py
@ -19,7 +19,7 @@ import pytest
@pytest.mark.asyncio
 async def test_simple_multi_turn_conversation():
  """Test a simple multi-turn conversation."""
-  AgentEvaluator.evaluate(
+  await AgentEvaluator.evaluate(
      agent_module="tests.integration.fixture.home_automation_agent",
      eval_dataset_file_path_or_dir="tests/integration/fixture/home_automation_agent/test_files/simple_multi_turn_conversation.test.json",
      num_runs=4,
@ -29,7 +29,7 @@ async def test_simple_multi_turn_conversation():
@pytest.mark.asyncio
 async def test_dependent_tool_calls():
  """Test subsequent tool calls that are dependent on previous tool calls."""
-  AgentEvaluator.evaluate(
+  await AgentEvaluator.evaluate(
      agent_module="tests.integration.fixture.home_automation_agent",
      eval_dataset_file_path_or_dir="tests/integration/fixture/home_automation_agent/test_files/dependent_tool_calls.test.json",
      num_runs=4,
@ -39,8 +39,7 @@ async def test_dependent_tool_calls():
@pytest.mark.asyncio
 async def test_memorizing_past_events():
  """Test memorizing past events."""
-
-  AgentEvaluator.evaluate(
+  await AgentEvaluator.evaluate(
      agent_module="tests.integration.fixture.home_automation_agent",
      eval_dataset_file_path_or_dir="tests/integration/fixture/home_automation_agent/test_files/memorizing_past_events/eval_data.test.json",
      num_runs=4,
--- a/tests/integration/test_sub_agent.py
+++ b/tests/integration/test_sub_agent.py
@ -19,10 +19,9 @@ import pytest
@pytest.mark.asyncio
 async def test_eval_agent():
  """Test hotel sub agent in a multi-agent system."""
-  AgentEvaluator.evaluate(
+  await AgentEvaluator.evaluate(
      agent_module="tests.integration.fixture.trip_planner_agent",
      eval_dataset_file_path_or_dir="tests/integration/fixture/trip_planner_agent/test_files/trip_inquiry_sub_agent.test.json",
-      initial_session_file="tests/integration/fixture/trip_planner_agent/test_files/initial.session.json",
      agent_name="identify_agent",
      num_runs=4,
  )
--- a/tests/integration/test_with_test_file.py
+++ b/tests/integration/test_with_test_file.py
@ -19,7 +19,7 @@ import pytest
@pytest.mark.asyncio
 async def test_with_single_test_file():
  """Test the agent's basic ability via session file."""
-  AgentEvaluator.evaluate(
+  await AgentEvaluator.evaluate(
      agent_module="tests.integration.fixture.home_automation_agent",
      eval_dataset_file_path_or_dir="tests/integration/fixture/home_automation_agent/simple_test.test.json",
  )
@ -28,7 +28,7 @@ async def test_with_single_test_file():
@pytest.mark.asyncio
 async def test_with_folder_of_test_files_long_running():
  """Test the agent's basic ability via a folder of session files."""
-  AgentEvaluator.evaluate(
+  await AgentEvaluator.evaluate(
      agent_module="tests.integration.fixture.home_automation_agent",
      eval_dataset_file_path_or_dir=(
          "tests/integration/fixture/home_automation_agent/test_files"