diff --git a/src/google/adk/evaluation/agent_evaluator.py b/src/google/adk/evaluation/agent_evaluator.py
index 734ee2e..a29af99 100644
--- a/src/google/adk/evaluation/agent_evaluator.py
+++ b/src/google/adk/evaluation/agent_evaluator.py
@@ -13,12 +13,18 @@
 # limitations under the License.
 
 import json
+import logging
 import os
 from os import path
+from typing import Any
 from typing import Dict
 from typing import List
+from typing import Optional
 from typing import Union
 import uuid
+
+from pydantic import ValidationError
+
 from .eval_set import EvalSet
 from .evaluation_generator import EvaluationGenerator
 from .evaluator import EvalStatus
@@ -28,6 +34,9 @@ from .local_eval_sets_manager import convert_eval_set_to_pydanctic_schema
 from .response_evaluator import ResponseEvaluator
 from .trajectory_evaluator import TrajectoryEvaluator
 
+logger = logging.getLogger(__name__)
+
+
 # Constants for default runs and evaluation criteria
 NUM_RUNS = 2
 TOOL_TRAJECTORY_SCORE_KEY = "tool_trajectory_avg_score"
@@ -131,18 +140,17 @@ class AgentEvaluator:
         )
 
         assert evaluation_result.overall_eval_status == EvalStatus.PASSED, (
-            f"`{eval_case_responses.eval_case.eval_id}`: "
             f"{metric_name} for {agent_module} Failed. Expected {threshold},"
             f" but got {evaluation_result.overall_score}."
         )
 
   @staticmethod
   async def evaluate(
-      agent_module,
-      eval_dataset_file_path_or_dir,
-      num_runs=NUM_RUNS,
-      agent_name=None,
-      initial_session_file=None,
+      agent_module: str,
+      eval_dataset_file_path_or_dir: str,
+      num_runs: int = NUM_RUNS,
+      agent_name: Optional[str] = None,
+      initial_session_file: Optional[str] = None,
   ):
     """Evaluates an Agent given eval data.
 
@@ -170,25 +178,14 @@ class AgentEvaluator:
     else:
       test_files = [eval_dataset_file_path_or_dir]
 
-    initial_session = {}
-    if initial_session_file:
-      with open(initial_session_file, "r") as f:
-        initial_session = json.loads(f.read())
+    initial_session = AgentEvaluator._get_initial_session(initial_session_file)
 
     for test_file in test_files:
-      data = AgentEvaluator._load_dataset(test_file)[0]
       criteria = AgentEvaluator.find_config_for_test_file(test_file)
-      AgentEvaluator._validate_input([data], criteria)
-
-      eval_data = {
-          "name": test_file,
-          "data": data,
-          "initial_session": initial_session,
-      }
-
-      eval_set = convert_eval_set_to_pydanctic_schema(
-          eval_set_id=str(uuid.uuid4()), eval_set_in_json_format=[eval_data]
+      eval_set = AgentEvaluator._load_eval_set_from_file(
+          test_file, criteria, initial_session
       )
+
       await AgentEvaluator.evaluate_eval_set(
           agent_module=agent_module,
           eval_set=eval_set,
@@ -197,6 +194,86 @@ class AgentEvaluator:
           agent_name=agent_name,
       )
 
+  @staticmethod
+  def migrate_eval_data_to_new_schema(
+      old_eval_data_file: str,
+      new_eval_data_file: str,
+      initial_session_file: Optional[str] = None,
+  ):
+    """Migrates an old-format eval data file to the EvalSet schema.
+
+    Raises ValueError if either data file path is empty.
+    """
+    if not old_eval_data_file or not new_eval_data_file:
+      raise ValueError(
+          "One of old_eval_data_file or new_eval_data_file is empty."
+      )
+    criteria = AgentEvaluator.find_config_for_test_file(old_eval_data_file)
+    initial_session = AgentEvaluator._get_initial_session(initial_session_file)
+    eval_set = AgentEvaluator._get_eval_set_from_old_format(
+        old_eval_data_file, criteria, initial_session
+    )
+    with open(new_eval_data_file, "w", encoding="utf-8") as f:
+      f.write(eval_set.model_dump_json(indent=2))
+
+  @staticmethod
+  def _load_eval_set_from_file(
+      eval_set_file: str,
+      criteria: dict[str, float],
+      initial_session: dict[str, Any],
+  ) -> EvalSet:
+    """Loads an EvalSet from a file, falling back to the older format."""
+    if os.path.isfile(eval_set_file):
+      with open(eval_set_file, "r", encoding="utf-8") as f:
+        content = f.read()
+      try:
+        eval_set = EvalSet.model_validate_json(content)
+      except ValidationError:
+        # We assume that the eval data was specified in the old format.
+        logger.warning(
+            f"Contents of {eval_set_file} appear to be in older format. To"
+            " avoid this warning, please update your test files to contain data"
+            " in EvalSet schema. You can use `migrate_eval_data_to_new_schema`"
+            " for migrating your old test files."
+        )
+      else:
+        # Kept out of the `try` so only schema validation is guarded.
+        assert len(initial_session) == 0, (
+            "Initial session should be specified as a part of EvalSet file."
+            " Explicit initial session is only needed, when specifying data in"
+            " the older schema."
+        )
+        return eval_set
+    # If we are here, the data must be specified in the older format.
+    return AgentEvaluator._get_eval_set_from_old_format(
+        eval_set_file, criteria, initial_session
+    )
+
+  @staticmethod
+  def _get_eval_set_from_old_format(
+      eval_set_file: str,
+      criteria: dict[str, float],
+      initial_session: dict[str, Any],
+  ) -> EvalSet:
+    """Builds an EvalSet from a test file written in the older schema."""
+    old_data = AgentEvaluator._load_dataset(eval_set_file)[0]
+    AgentEvaluator._validate_input([old_data], criteria)
+    old_eval_case = dict(
+        name=eval_set_file, data=old_data, initial_session=initial_session
+    )
+    return convert_eval_set_to_pydanctic_schema(
+        eval_set_id=str(uuid.uuid4()),
+        eval_set_in_json_format=[old_eval_case],
+    )
+
+  @staticmethod
+  def _get_initial_session(initial_session_file: Optional[str] = None):
+    """Returns the session parsed from the given JSON file, or {} if none."""
+    if not initial_session_file:
+      return {}
+    with open(initial_session_file, "r", encoding="utf-8") as f:
+      return json.load(f)
+
   @staticmethod
   def _load_dataset(
       input_data: Union[str, List[str], List[Dict], List[List[Dict]]],
diff --git a/src/google/adk/evaluation/local_eval_sets_manager.py b/src/google/adk/evaluation/local_eval_sets_manager.py
index 55f95a5..456893f 100644
--- a/src/google/adk/evaluation/local_eval_sets_manager.py
+++ b/src/google/adk/evaluation/local_eval_sets_manager.py
@@ -135,7 +135,10 @@ def convert_eval_set_to_pydanctic_schema(
       )
 
     session_input = None
-    if "initial_session" in old_eval_case:
+    if (
+        "initial_session" in old_eval_case
+        and len(old_eval_case["initial_session"]) > 0
+    ):
       session_input = SessionInput(
           app_name=old_eval_case["initial_session"].get("app_name", ""),
           user_id=old_eval_case["initial_session"].get("user_id", ""),
diff --git a/tests/integration/fixture/ecommerce_customer_service_agent/order_query.test.json b/tests/integration/fixture/ecommerce_customer_service_agent/order_query.test.json
index ac424f3..6c215ad 100644
--- a/tests/integration/fixture/ecommerce_customer_service_agent/order_query.test.json
+++ b/tests/integration/fixture/ecommerce_customer_service_agent/order_query.test.json
@@ -1,69 +1,229 @@
-[
-  {
-    "query": "Send an email to user user_a whose email address is alice@example.com",
-    "expected_tool_use": [
-      {
-        "tool_name": "send_email",
-        "tool_input": {
-          "email": "alice@example.com",
-          "user_id": "user_a"
+{
+  "eval_set_id": "a1157c01-851f-48a8-b956-83cf7f463510",
+  "name": "a1157c01-851f-48a8-b956-83cf7f463510",
+  "description": null,
+  "eval_cases": [
+    {
+      "eval_id": "tests/integration/fixture/ecommerce_customer_service_agent/order_query.test.json",
+      "conversation": [
+        {
+          "invocation_id": "38d54523-d789-4873-8cc0-d38826c7feb4",
+          "user_content": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "Send an email to user user_a whose email address is alice@example.com"
+              }
+            ],
+            "role": "user"
+          },
+          "final_response": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "Email sent to alice@example.com for user id user_a."
+              }
+            ],
+            "role": "model"
+          },
+          "intermediate_data": {
+            "tool_uses": [
+              {
+                "id": null,
+                "args": {
+                  "email": "alice@example.com",
+                  "user_id": "user_a"
+                },
+                "name": "send_email"
+              }
+            ],
+            "intermediate_responses": []
+          },
+          "creation_timestamp": 1747341706.6240807
+        },
+        {
+          "invocation_id": "916393ab-0bce-4cb0-98de-6573d4e8e25c",
+          "user_content": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "Can you tell me the status of my order with ID 1?"
+              }
+            ],
+            "role": "user"
+          },
+          "final_response": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "Your order with ID 1 is FINISHED."
+              }
+            ],
+            "role": "model"
+          },
+          "intermediate_data": {
+            "tool_uses": [
+              {
+                "id": null,
+                "args": {
+                  "order_id": "1"
+                },
+                "name": "get_order_status"
+              }
+            ],
+            "intermediate_responses": []
+          },
+          "creation_timestamp": 1747341706.6241167
+        },
+        {
+          "invocation_id": "511b23d9-56f9-423b-9c31-7626f3411c32",
+          "user_content": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "Cancel all pending order for the user with user id user_a"
+              }
+            ],
+            "role": "user"
+          },
+          "final_response": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "I have checked your orders and order 4 was in pending status, so I have cancelled it. Order 1 was already finished and couldn't be cancelled.\n"
+              }
+            ],
+            "role": "model"
+          },
+          "intermediate_data": {
+            "tool_uses": [
+              {
+                "id": null,
+                "args": {
+                  "user_id": "user_a"
+                },
+                "name": "get_order_ids_for_user"
+              },
+              {
+                "id": null,
+                "args": {
+                  "order_id": "1"
+                },
+                "name": "get_order_status"
+              },
+              {
+                "id": null,
+                "args": {
+                  "order_id": "4"
+                },
+                "name": "get_order_status"
+              },
+              {
+                "id": null,
+                "args": {
+                  "order_id": "4"
+                },
+                "name": "cancel_order"
+              }
+            ],
+            "intermediate_responses": []
+          },
+          "creation_timestamp": 1747341706.6241703
+        },
+        {
+          "invocation_id": "dcdf4b6d-96dd-4602-8c14-0563c6f6b5d0",
+          "user_content": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "What orders have I placed under the username user_b?"
+              }
+            ],
+            "role": "user"
+          },
+          "final_response": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "User user_b has placed one order with order ID 2.\n"
+              }
+            ],
+            "role": "model"
+          },
+          "intermediate_data": {
+            "tool_uses": [
+              {
+                "id": null,
+                "args": {
+                  "user_id": "user_b"
+                },
+                "name": "get_order_ids_for_user"
+              }
+            ],
+            "intermediate_responses": []
+          },
+          "creation_timestamp": 1747341706.624196
         }
-      }
-    ],
-    "reference": "Email sent to alice@example.com for user id user_a."
-  },
-  {
-    "query": "Can you tell me the status of my order with ID 1?",
-    "expected_tool_use": [
-      {
-        "tool_name": "get_order_status",
-        "tool_input": {
-          "order_id": "1"
-        }
-      }
-    ],
-    "reference": "Your order with ID 1 is FINISHED."
-  },
-  {
-    "query": "Cancel all pending order for the user with user id user_a",
-    "expected_tool_use": [
-      {
-        "tool_name": "get_order_ids_for_user",
-        "tool_input": {
-          "user_id": "user_a"
-        }
-      },
-      {
-        "tool_name": "get_order_status",
-        "tool_input": {
-          "order_id": "1"
-        }
-      },
-      {
-        "tool_name": "get_order_status",
-        "tool_input": {
-          "order_id": "4"
-        }
-      },
-      {
-        "tool_name": "cancel_order",
-        "tool_input": {
-          "order_id": "4"
-        }
-      }
-    ],
-    "reference": "I have checked your orders and order 4 was in pending status, so I have cancelled it. Order 1 was already finished and couldn't be cancelled.\n"
-  },
-  {
-    "query": "What orders have I placed under the username user_b?",
-    "expected_tool_use": [
-      {
-        "tool_name": "get_order_ids_for_user",
-        "tool_input": {
-          "user_id": "user_b"
-        }
-      }
-    ],
-    "reference": "User user_b has placed one order with order ID 2.\n"
-  }
-]
+      ],
+      "session_input": null,
+      "creation_timestamp": 1747341706.6242023
+    }
+  ],
+  "creation_timestamp": 1747341706.6242158
+}
\ No newline at end of file
diff --git a/tests/integration/fixture/hello_world_agent/roll_die.test.json b/tests/integration/fixture/hello_world_agent/roll_die.test.json
index fdc8127..7c1e453 100644
--- a/tests/integration/fixture/hello_world_agent/roll_die.test.json
+++ b/tests/integration/fixture/hello_world_agent/roll_die.test.json
@@ -1,24 +1,143 @@
-[
-  {
-    "query": "Hi who are you?",
-    "expected_tool_use": [],
-    "reference": "I am a data processing agent. I can roll dice and check if the results are prime numbers. What would you like me to do? \n"
-  },
-  {
-    "query": "What can you do?",
-    "expected_tool_use": [],
-    "reference": "I can roll dice for you of different sizes, and I can check if the results are prime numbers.  I can also remember previous rolls if you'd like to check those for primes as well.  What would you like me to do? \n"
-  },
-  {
-    "query": "Can you roll a die with 6 sides",
-    "expected_tool_use": [
-      {
-        "tool_name": "roll_die",
-        "tool_input": {
-          "sides": 6
+{
+  "eval_set_id": "56540925-a5ff-49fe-a4e1-589fe78066f2",
+  "name": "56540925-a5ff-49fe-a4e1-589fe78066f2",
+  "description": null,
+  "eval_cases": [
+    {
+      "eval_id": "tests/integration/fixture/hello_world_agent/roll_die.test.json",
+      "conversation": [
+        {
+          "invocation_id": "b01f67f0-9f23-44d6-bbe4-36ea235cb9fb",
+          "user_content": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "Hi who are you?"
+              }
+            ],
+            "role": "user"
+          },
+          "final_response": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "I am a data processing agent. I can roll dice and check if the results are prime numbers. What would you like me to do? \n"
+              }
+            ],
+            "role": "model"
+          },
+          "intermediate_data": {
+            "tool_uses": [],
+            "intermediate_responses": []
+          },
+          "creation_timestamp": 1747341775.8937013
+        },
+        {
+          "invocation_id": "13be0093-ac29-4828-98c6-5bbd570c010c",
+          "user_content": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "What can you do?"
+              }
+            ],
+            "role": "user"
+          },
+          "final_response": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "I can roll dice for you of different sizes, and I can check if the results are prime numbers.  I can also remember previous rolls if you'd like to check those for primes as well.  What would you like me to do? \n"
+              }
+            ],
+            "role": "model"
+          },
+          "intermediate_data": {
+            "tool_uses": [],
+            "intermediate_responses": []
+          },
+          "creation_timestamp": 1747341775.8937378
+        },
+        {
+          "invocation_id": "7deda353-c936-4c21-b242-9fa75e45b6a7",
+          "user_content": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "Can you roll a die with 6 sides"
+              }
+            ],
+            "role": "user"
+          },
+          "final_response": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": null
+              }
+            ],
+            "role": "model"
+          },
+          "intermediate_data": {
+            "tool_uses": [
+              {
+                "id": null,
+                "args": {
+                  "sides": 6
+                },
+                "name": "roll_die"
+              }
+            ],
+            "intermediate_responses": []
+          },
+          "creation_timestamp": 1747341775.8937788
         }
-      }
-    ],
-    "reference": null
-  }
-]
+      ],
+      "session_input": null,
+      "creation_timestamp": 1747341775.8937826
+    }
+  ],
+  "creation_timestamp": 1747341775.8937957
+}
\ No newline at end of file
diff --git a/tests/integration/fixture/home_automation_agent/simple_test.test.json b/tests/integration/fixture/home_automation_agent/simple_test.test.json
index 978c36f..8e055dd 100644
--- a/tests/integration/fixture/home_automation_agent/simple_test.test.json
+++ b/tests/integration/fixture/home_automation_agent/simple_test.test.json
@@ -1,5 +1,65 @@
-[{
-  "query": "Turn off device_2 in the Bedroom.",
-  "expected_tool_use": [{"tool_name": "set_device_info", "tool_input": {"location": "Bedroom", "device_id": "device_2", "status": "OFF"}}],
-  "reference": "I have set the device_2 status to off."
-}]
+{
+  "eval_set_id": "b305bd06-38c5-4796-b9c7-d9c7454338b9",
+  "name": "b305bd06-38c5-4796-b9c7-d9c7454338b9",
+  "description": null,
+  "eval_cases": [
+    {
+      "eval_id": "tests/integration/fixture/home_automation_agent/simple_test.test.json",
+      "conversation": [
+        {
+          "invocation_id": "b7982664-0ab6-47cc-ab13-326656afdf75",
+          "user_content": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "Turn off device_2 in the Bedroom."
+              }
+            ],
+            "role": "user"
+          },
+          "final_response": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "I have set the device_2 status to off."
+              }
+            ],
+            "role": "model"
+          },
+          "intermediate_data": {
+            "tool_uses": [
+              {
+                "id": null,
+                "args": {
+                  "location": "Bedroom",
+                  "device_id": "device_2",
+                  "status": "OFF"
+                },
+                "name": "set_device_info"
+              }
+            ],
+            "intermediate_responses": []
+          },
+          "creation_timestamp": 1747337309.2360144
+        }
+      ],
+      "session_input": null,
+      "creation_timestamp": 1747337309.2360282
+    }
+  ],
+  "creation_timestamp": 1747337309.2360387
+}
\ No newline at end of file
diff --git a/tests/integration/fixture/home_automation_agent/test_files/dependent_tool_calls.test.json b/tests/integration/fixture/home_automation_agent/test_files/dependent_tool_calls.test.json
index 0633eab..243c1dc 100644
--- a/tests/integration/fixture/home_automation_agent/test_files/dependent_tool_calls.test.json
+++ b/tests/integration/fixture/home_automation_agent/test_files/dependent_tool_calls.test.json
@@ -1,18 +1,113 @@
-[
-  {
-      "query": "Turn off device_2 in the Bedroom.",
-      "expected_tool_use": [{
-          "tool_name": "set_device_info",
-          "tool_input": {"location": "Bedroom", "status": "OFF", "device_id": "device_2"}
-      }],
-      "reference": "I have set the device 2 status to off."
-  },
-  {
-      "query": "What's the status of device_2 in the Bedroom?",
-      "expected_tool_use": [{
-          "tool_name": "get_device_info",
-          "tool_input": {"device_id": "device_2"}
-      }],
-      "reference": "Status of device_2 is off."
-  }
-]
+{
+  "eval_set_id": "1be50511-ff75-4d68-b2d7-2165cbdc1044",
+  "name": "1be50511-ff75-4d68-b2d7-2165cbdc1044",
+  "description": null,
+  "eval_cases": [
+    {
+      "eval_id": "tests/integration/fixture/home_automation_agent/test_files/dependent_tool_calls.test.json",
+      "conversation": [
+        {
+          "invocation_id": "cbece1c0-3811-45c0-96fc-9a4279075483",
+          "user_content": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "Turn off device_2 in the Bedroom."
+              }
+            ],
+            "role": "user"
+          },
+          "final_response": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "I have set the device 2 status to off."
+              }
+            ],
+            "role": "model"
+          },
+          "intermediate_data": {
+            "tool_uses": [
+              {
+                "id": null,
+                "args": {
+                  "location": "Bedroom",
+                  "status": "OFF",
+                  "device_id": "device_2"
+                },
+                "name": "set_device_info"
+              }
+            ],
+            "intermediate_responses": []
+          },
+          "creation_timestamp": 1747340826.1082227
+        },
+        {
+          "invocation_id": "cc85cdae-4258-4b94-8fe7-a985b8356190",
+          "user_content": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "What's the status of device_2 in the Bedroom?"
+              }
+            ],
+            "role": "user"
+          },
+          "final_response": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "Status of device_2 is off."
+              }
+            ],
+            "role": "model"
+          },
+          "intermediate_data": {
+            "tool_uses": [
+              {
+                "id": null,
+                "args": {
+                  "device_id": "device_2"
+                },
+                "name": "get_device_info"
+              }
+            ],
+            "intermediate_responses": []
+          },
+          "creation_timestamp": 1747340826.1082554
+        }
+      ],
+      "session_input": null,
+      "creation_timestamp": 1747340826.108262
+    }
+  ],
+  "creation_timestamp": 1747340826.108275
+}
\ No newline at end of file
diff --git a/tests/integration/fixture/home_automation_agent/test_files/memorizing_past_events/eval_data.test.json b/tests/integration/fixture/home_automation_agent/test_files/memorizing_past_events/eval_data.test.json
index 0e5778b..612f3cd 100644
--- a/tests/integration/fixture/home_automation_agent/test_files/memorizing_past_events/eval_data.test.json
+++ b/tests/integration/fixture/home_automation_agent/test_files/memorizing_past_events/eval_data.test.json
@@ -1,17 +1,105 @@
-[
-  {
-      "query": "Turn off device_2 in the Bedroom.",
-      "expected_tool_use": [
-          {
-              "tool_name": "set_device_info",
-              "tool_input": {"location": "Bedroom", "device_id": "device_2", "status": "OFF"}
-          }
+{
+  "eval_set_id": "94553685-5f19-492b-bc44-f3bc775955e9",
+  "name": "94553685-5f19-492b-bc44-f3bc775955e9",
+  "description": null,
+  "eval_cases": [
+    {
+      "eval_id": "tests/integration/fixture/home_automation_agent/test_files/memorizing_past_events/eval_data.test.json",
+      "conversation": [
+        {
+          "invocation_id": "a958b622-21d3-4a6c-9c15-1274bbb8a6b6",
+          "user_content": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "Turn off device_2 in the Bedroom."
+              }
+            ],
+            "role": "user"
+          },
+          "final_response": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "OK. I've turned off device_2 in the Bedroom. Anything else?\n"
+              }
+            ],
+            "role": "model"
+          },
+          "intermediate_data": {
+            "tool_uses": [
+              {
+                "id": null,
+                "args": {
+                  "location": "Bedroom",
+                  "device_id": "device_2",
+                  "status": "OFF"
+                },
+                "name": "set_device_info"
+              }
+            ],
+            "intermediate_responses": []
+          },
+          "creation_timestamp": 1747340865.7043095
+        },
+        {
+          "invocation_id": "1c07123d-4bed-4eb0-9e55-c7f80c70dadf",
+          "user_content": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "What's the command I just issued?"
+              }
+            ],
+            "role": "user"
+          },
+          "final_response": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "You asked me to turn off device_2 in the Bedroom.\n"
+              }
+            ],
+            "role": "model"
+          },
+          "intermediate_data": {
+            "tool_uses": [],
+            "intermediate_responses": []
+          },
+          "creation_timestamp": 1747340865.7043421
+        }
       ],
-      "reference": "OK. I've turned off device_2 in the Bedroom. Anything else?\n"
-  },
-  {
-      "query": "What's the command I just issued?",
-      "expected_tool_use": [],
-      "reference": "You asked me to turn off device_2 in the Bedroom.\n"
-  }
-]
+      "session_input": null,
+      "creation_timestamp": 1747340865.7043483
+    }
+  ],
+  "creation_timestamp": 1747340865.704361
+}
\ No newline at end of file
diff --git a/tests/integration/fixture/home_automation_agent/test_files/simple_multi_turn_conversation.test.json b/tests/integration/fixture/home_automation_agent/test_files/simple_multi_turn_conversation.test.json
index 334dd2d..dfe2b15 100644
--- a/tests/integration/fixture/home_automation_agent/test_files/simple_multi_turn_conversation.test.json
+++ b/tests/integration/fixture/home_automation_agent/test_files/simple_multi_turn_conversation.test.json
@@ -1,18 +1,115 @@
-[
+{
+  "eval_set_id": "4412cca6-dfcd-43ab-bbc5-9155380c7137",
+  "name": "4412cca6-dfcd-43ab-bbc5-9155380c7137",
+  "description": null,
+  "eval_cases": [
     {
-        "query": "Turn off device_2 in the Bedroom.",
-        "expected_tool_use": [{
-            "tool_name": "set_device_info",
-            "tool_input": {"location": "Bedroom", "device_id": "device_2", "status": "OFF"}
-        }],
-        "reference": "I have set the device 2 status to off."
-    },
-    {
-        "query": "Turn on device_2 in the Bedroom.",
-        "expected_tool_use": [{
-            "tool_name": "set_device_info",
-            "tool_input": {"location": "Bedroom", "status": "ON", "device_id": "device_2"}
-        }],
-        "reference": "I have set the device 2 status to on."
+      "eval_id": "tests/integration/fixture/home_automation_agent/test_files/simple_multi_turn_conversation.test.json",
+      "conversation": [
+        {
+          "invocation_id": "9f51a1ac-56a4-4b4a-9878-36ff1ae312ce",
+          "user_content": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "Turn off device_2 in the Bedroom."
+              }
+            ],
+            "role": "user"
+          },
+          "final_response": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "I have set the device 2 status to off."
+              }
+            ],
+            "role": "model"
+          },
+          "intermediate_data": {
+            "tool_uses": [
+              {
+                "id": null,
+                "args": {
+                  "location": "Bedroom",
+                  "device_id": "device_2",
+                  "status": "OFF"
+                },
+                "name": "set_device_info"
+              }
+            ],
+            "intermediate_responses": []
+          },
+          "creation_timestamp": 1747340791.7353904
+        },
+        {
+          "invocation_id": "c82d54d0-5fa8-4f79-a6dc-692090f0d42b",
+          "user_content": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "Turn on device_2 in the Bedroom."
+              }
+            ],
+            "role": "user"
+          },
+          "final_response": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "I have set the device 2 status to on."
+              }
+            ],
+            "role": "model"
+          },
+          "intermediate_data": {
+            "tool_uses": [
+              {
+                "id": null,
+                "args": {
+                  "location": "Bedroom",
+                  "status": "ON",
+                  "device_id": "device_2"
+                },
+                "name": "set_device_info"
+              }
+            ],
+            "intermediate_responses": []
+          },
+          "creation_timestamp": 1747340791.7354295
+        }
+      ],
+      "session_input": null,
+      "creation_timestamp": 1747340791.7354348
     }
-]
+  ],
+  "creation_timestamp": 1747340791.735446
+}
\ No newline at end of file
diff --git a/tests/integration/fixture/home_automation_agent/test_files/simple_test.test.json b/tests/integration/fixture/home_automation_agent/test_files/simple_test.test.json
index 0e5778b..b324a11 100644
--- a/tests/integration/fixture/home_automation_agent/test_files/simple_test.test.json
+++ b/tests/integration/fixture/home_automation_agent/test_files/simple_test.test.json
@@ -1,17 +1,105 @@
-[
-  {
-      "query": "Turn off device_2 in the Bedroom.",
-      "expected_tool_use": [
-          {
-              "tool_name": "set_device_info",
-              "tool_input": {"location": "Bedroom", "device_id": "device_2", "status": "OFF"}
-          }
+{
+  "eval_set_id": "9100bfc9-cc28-4ab9-b920-2dc72e138997",
+  "name": "9100bfc9-cc28-4ab9-b920-2dc72e138997",
+  "description": null,
+  "eval_cases": [
+    {
+      "eval_id": "tests/integration/fixture/home_automation_agent/test_files/simple_test.test.json",
+      "conversation": [
+        {
+          "invocation_id": "9f5e8d91-8e51-41d6-addf-196a828168c5",
+          "user_content": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "Turn off device_2 in the Bedroom."
+              }
+            ],
+            "role": "user"
+          },
+          "final_response": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "OK. I've turned off device_2 in the Bedroom. Anything else?\n"
+              }
+            ],
+            "role": "model"
+          },
+          "intermediate_data": {
+            "tool_uses": [
+              {
+                "id": null,
+                "args": {
+                  "location": "Bedroom",
+                  "device_id": "device_2",
+                  "status": "OFF"
+                },
+                "name": "set_device_info"
+              }
+            ],
+            "intermediate_responses": []
+          },
+          "creation_timestamp": 1747340849.0429707
+        },
+        {
+          "invocation_id": "767b2451-5f7b-4c73-aeaf-a82c71e15788",
+          "user_content": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "What's the command I just issued?"
+              }
+            ],
+            "role": "user"
+          },
+          "final_response": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "You asked me to turn off device_2 in the Bedroom.\n"
+              }
+            ],
+            "role": "model"
+          },
+          "intermediate_data": {
+            "tool_uses": [],
+            "intermediate_responses": []
+          },
+          "creation_timestamp": 1747340849.0429986
+        }
       ],
-      "reference": "OK. I've turned off device_2 in the Bedroom. Anything else?\n"
-  },
-  {
-      "query": "What's the command I just issued?",
-      "expected_tool_use": [],
-      "reference": "You asked me to turn off device_2 in the Bedroom.\n"
-  }
-]
+      "session_input": null,
+      "creation_timestamp": 1747340849.0430045
+    }
+  ],
+  "creation_timestamp": 1747340849.0430162
+}
\ No newline at end of file
diff --git a/tests/integration/fixture/home_automation_agent/test_files/simple_test2.test.json b/tests/integration/fixture/home_automation_agent/test_files/simple_test2.test.json
index 5ba5d82..6efb313 100644
--- a/tests/integration/fixture/home_automation_agent/test_files/simple_test2.test.json
+++ b/tests/integration/fixture/home_automation_agent/test_files/simple_test2.test.json
@@ -1,5 +1,65 @@
-[{
-  "query": "Turn off device_3 in the Bedroom.",
-  "expected_tool_use": [{"tool_name": "set_device_info", "tool_input": {"location": "Bedroom", "device_id": "device_3", "status": "OFF"}}],
-  "reference": "I have set the device_3 status to off."
-}]
+{
+  "eval_set_id": "e141f90b-9e7e-4f06-94d7-bbe7e8080ead",
+  "name": "e141f90b-9e7e-4f06-94d7-bbe7e8080ead",
+  "description": null,
+  "eval_cases": [
+    {
+      "eval_id": "tests/integration/fixture/home_automation_agent/test_files/simple_test2.test.json",
+      "conversation": [
+        {
+          "invocation_id": "c35582f7-838a-460f-b783-039e278165e0",
+          "user_content": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "Turn off device_3 in the Bedroom."
+              }
+            ],
+            "role": "user"
+          },
+          "final_response": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "I have set the device_3 status to off."
+              }
+            ],
+            "role": "model"
+          },
+          "intermediate_data": {
+            "tool_uses": [
+              {
+                "id": null,
+                "args": {
+                  "location": "Bedroom",
+                  "device_id": "device_3",
+                  "status": "OFF"
+                },
+                "name": "set_device_info"
+              }
+            ],
+            "intermediate_responses": []
+          },
+          "creation_timestamp": 1747340814.8645504
+        }
+      ],
+      "session_input": null,
+      "creation_timestamp": 1747340814.86456
+    }
+  ],
+  "creation_timestamp": 1747340814.864572
+}
\ No newline at end of file
diff --git a/tests/integration/fixture/trip_planner_agent/test_files/initial.session.json b/tests/integration/fixture/trip_planner_agent/test_files/initial.session.json
deleted file mode 100644
index b33840c..0000000
--- a/tests/integration/fixture/trip_planner_agent/test_files/initial.session.json
+++ /dev/null
@@ -1,13 +0,0 @@
-{
-  "id": "test_id",
-  "app_name": "trip_planner_agent",
-  "user_id": "test_user",
-  "state": {
-    "origin": "San Francisco",
-    "interests": "Food, Shopping, Museums",
-    "range": "1000 miles",
-    "cities": ""
-  },
-  "events": [],
-  "last_update_time": 1741218714.258285
-}
diff --git a/tests/integration/fixture/trip_planner_agent/test_files/trip_inquiry_sub_agent.test.json b/tests/integration/fixture/trip_planner_agent/test_files/trip_inquiry_sub_agent.test.json
index 03f52ab..9fe7c6a 100644
--- a/tests/integration/fixture/trip_planner_agent/test_files/trip_inquiry_sub_agent.test.json
+++ b/tests/integration/fixture/trip_planner_agent/test_files/trip_inquiry_sub_agent.test.json
@@ -1,7 +1,64 @@
-[
-  {
-    "query": "Based on my interests, where should I go, Yosemite national park or Los Angeles?",
-    "expected_tool_use": [],
-    "reference": "Given your interests in food, shopping, and museums, Los Angeles would be a better choice than Yosemite National Park. Yosemite is primarily focused on outdoor activities and natural landscapes, while Los Angeles offers a diverse range of culinary experiences, shopping districts, and world-class museums. I will now gather information to create an in-depth guide for your trip to Los Angeles.\n"
-  }
-]
+{
+  "eval_set_id": "189d6856-9b90-4b9c-bda8-7cec899507ae",
+  "name": "189d6856-9b90-4b9c-bda8-7cec899507ae",
+  "description": null,
+  "eval_cases": [
+    {
+      "eval_id": "tests/integration/fixture/trip_planner_agent/test_files/trip_inquiry_sub_agent.test.json",
+      "conversation": [
+        {
+          "invocation_id": "1c2e8003-d19c-4912-b0ae-17b9d568f8fb",
+          "user_content": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "Based on my interests, where should I go, Yosemite national park or Los Angeles?"
+              }
+            ],
+            "role": "user"
+          },
+          "final_response": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "file_data": null,
+                "function_call": null,
+                "function_response": null,
+                "inline_data": null,
+                "text": "Given your interests in food, shopping, and museums, Los Angeles would be a better choice than Yosemite National Park. Yosemite is primarily focused on outdoor activities and natural landscapes, while Los Angeles offers a diverse range of culinary experiences, shopping districts, and world-class museums. I will now gather information to create an in-depth guide for your trip to Los Angeles.\n"
+              }
+            ],
+            "role": "model"
+          },
+          "intermediate_data": {
+            "tool_uses": [],
+            "intermediate_responses": []
+          },
+          "creation_timestamp": 1747339378.484014
+        }
+      ],
+      "session_input": {
+        "app_name": "trip_planner_agent",
+        "user_id": "test_user",
+        "state": {
+          "origin": "San Francisco",
+          "interests": "Food, Shopping, Museums",
+          "range": "1000 miles",
+          "cities": ""
+        }
+      },
+      "creation_timestamp": 1747339378.484044
+    }
+  ],
+  "creation_timestamp": 1747339378.484056
+}
\ No newline at end of file
diff --git a/tests/integration/test_evalute_agent_in_fixture.py b/tests/integration/test_evalute_agent_in_fixture.py
index 8f9b77f..4fdeed9 100644
--- a/tests/integration/test_evalute_agent_in_fixture.py
+++ b/tests/integration/test_evalute_agent_in_fixture.py
@@ -32,15 +32,9 @@ def agent_eval_artifacts_in_fixture():
       # Evaluation test files end with test.json
       if not filename.endswith('test.json'):
         continue
-      initial_session_file = (
-          f'tests/integration/fixture/{agent_name}/initial.session.json'
-      )
       agent_eval_artifacts.append((
           f'tests.integration.fixture.{agent_name}',
           f'tests/integration/fixture/{agent_name}/{filename}',
-          initial_session_file
-          if os.path.exists(initial_session_file)
-          else None,
       ))
 
   # This method gets invoked twice, sorting helps ensure that both the
@@ -53,12 +47,12 @@ def agent_eval_artifacts_in_fixture():
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
-    'agent_name, evalfile, initial_session_file',
+    'agent_name, evalfile',
     agent_eval_artifacts_in_fixture(),
-    ids=[agent_name for agent_name, _, _ in agent_eval_artifacts_in_fixture()],
+    ids=[agent_name for agent_name, _ in agent_eval_artifacts_in_fixture()],
 )
 async def test_evaluate_agents_long_running_4_runs_per_eval_item(
-    agent_name, evalfile, initial_session_file
+    agent_name, evalfile
 ):
   """Test agents evaluation in fixture folder.
 
@@ -70,7 +64,6 @@ async def test_evaluate_agents_long_running_4_runs_per_eval_item(
   await AgentEvaluator.evaluate(
       agent_module=agent_name,
       eval_dataset_file_path_or_dir=evalfile,
-      initial_session_file=initial_session_file,
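-      # Using a slightly higher value helps us manange the variances that may
+      # Using a slightly higher value helps us manage the variances that may
       # happen in each eval.
-      # This, of course, comes at a cost of incrased test run times.
+      # This, of course, comes at a cost of increased test run times.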
diff --git a/tests/integration/test_multi_agent.py b/tests/integration/test_multi_agent.py
index 2d0ccab..3d161a9 100644
--- a/tests/integration/test_multi_agent.py
+++ b/tests/integration/test_multi_agent.py
@@ -18,13 +18,10 @@ import pytest
 
 @pytest.mark.asyncio
 async def test_eval_agent():
-  AgentEvaluator.evaluate(
+  await AgentEvaluator.evaluate(
       agent_module="tests.integration.fixture.trip_planner_agent",
       eval_dataset_file_path_or_dir=(
           "tests/integration/fixture/trip_planner_agent/trip_inquiry.test.json"
       ),
-      initial_session_file=(
-          "tests/integration/fixture/trip_planner_agent/initial.session.json"
-      ),
       num_runs=4,
   )
diff --git a/tests/integration/test_multi_turn.py b/tests/integration/test_multi_turn.py
index 82ca746..5e300a7 100644
--- a/tests/integration/test_multi_turn.py
+++ b/tests/integration/test_multi_turn.py
@@ -19,7 +19,7 @@ import pytest
 @pytest.mark.asyncio
 async def test_simple_multi_turn_conversation():
   """Test a simple multi-turn conversation."""
-  AgentEvaluator.evaluate(
+  await AgentEvaluator.evaluate(
       agent_module="tests.integration.fixture.home_automation_agent",
       eval_dataset_file_path_or_dir="tests/integration/fixture/home_automation_agent/test_files/simple_multi_turn_conversation.test.json",
       num_runs=4,
@@ -29,7 +29,7 @@ async def test_simple_multi_turn_conversation():
 @pytest.mark.asyncio
 async def test_dependent_tool_calls():
   """Test subsequent tool calls that are dependent on previous tool calls."""
-  AgentEvaluator.evaluate(
+  await AgentEvaluator.evaluate(
       agent_module="tests.integration.fixture.home_automation_agent",
       eval_dataset_file_path_or_dir="tests/integration/fixture/home_automation_agent/test_files/dependent_tool_calls.test.json",
       num_runs=4,
@@ -39,8 +39,7 @@ async def test_dependent_tool_calls():
 @pytest.mark.asyncio
 async def test_memorizing_past_events():
   """Test memorizing past events."""
-
-  AgentEvaluator.evaluate(
+  await AgentEvaluator.evaluate(
       agent_module="tests.integration.fixture.home_automation_agent",
       eval_dataset_file_path_or_dir="tests/integration/fixture/home_automation_agent/test_files/memorizing_past_events/eval_data.test.json",
       num_runs=4,
diff --git a/tests/integration/test_sub_agent.py b/tests/integration/test_sub_agent.py
index b3d89b3..cbfb90b 100644
--- a/tests/integration/test_sub_agent.py
+++ b/tests/integration/test_sub_agent.py
@@ -19,10 +19,9 @@ import pytest
 @pytest.mark.asyncio
 async def test_eval_agent():
   """Test hotel sub agent in a multi-agent system."""
-  AgentEvaluator.evaluate(
+  await AgentEvaluator.evaluate(
       agent_module="tests.integration.fixture.trip_planner_agent",
       eval_dataset_file_path_or_dir="tests/integration/fixture/trip_planner_agent/test_files/trip_inquiry_sub_agent.test.json",
-      initial_session_file="tests/integration/fixture/trip_planner_agent/test_files/initial.session.json",
       agent_name="identify_agent",
       num_runs=4,
   )
diff --git a/tests/integration/test_with_test_file.py b/tests/integration/test_with_test_file.py
index 664bdbf..d19428f 100644
--- a/tests/integration/test_with_test_file.py
+++ b/tests/integration/test_with_test_file.py
@@ -19,7 +19,7 @@ import pytest
 @pytest.mark.asyncio
 async def test_with_single_test_file():
   """Test the agent's basic ability via session file."""
-  AgentEvaluator.evaluate(
+  await AgentEvaluator.evaluate(
       agent_module="tests.integration.fixture.home_automation_agent",
       eval_dataset_file_path_or_dir="tests/integration/fixture/home_automation_agent/simple_test.test.json",
   )
@@ -28,7 +28,7 @@ async def test_with_single_test_file():
 @pytest.mark.asyncio
 async def test_with_folder_of_test_files_long_running():
   """Test the agent's basic ability via a folder of session files."""
-  AgentEvaluator.evaluate(
+  await AgentEvaluator.evaluate(
       agent_module="tests.integration.fixture.home_automation_agent",
       eval_dataset_file_path_or_dir=(
           "tests/integration/fixture/home_automation_agent/test_files"