# Copyright 2025 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import annotations import json import os import uuid from google.adk.errors.not_found_error import NotFoundError from google.adk.evaluation.eval_case import EvalCase from google.adk.evaluation.eval_case import IntermediateData from google.adk.evaluation.eval_case import Invocation from google.adk.evaluation.eval_set import EvalSet from google.adk.evaluation.local_eval_sets_manager import _EVAL_SET_FILE_EXTENSION from google.adk.evaluation.local_eval_sets_manager import convert_eval_set_to_pydanctic_schema from google.adk.evaluation.local_eval_sets_manager import load_eval_set_from_file from google.adk.evaluation.local_eval_sets_manager import LocalEvalSetsManager from google.genai import types as genai_types from pydantic import ValidationError import pytest class TestConvertEvalSetToPydancticSchema: """Tests convert_eval_set_to_pydanctic_schema method.""" def test_convert_eval_set_to_pydanctic_schema_complete(self): eval_set_id = "test_eval_set" eval_set_in_json_format = [{ "name": "roll_17_sided_dice_twice", "data": [ { "query": "What can you do?", "expected_tool_use": [], "expected_intermediate_agent_responses": [], "reference": ( "I can roll dice of different sizes and check if a number" " is prime. I can also use multiple tools in parallel.\n" ), }, { "query": "Roll a 17 sided dice twice for me", "expected_tool_use": [ {"tool_name": "roll_die", "tool_input": {"sides": 17}}, {"tool_name": "roll_die", "tool_input": {"sides": 17}}, ], "expected_intermediate_agent_responses": [ {"author": "agent1", "text": "thought1"} ], "reference": ( "I have rolled a 17 sided die twice. The first roll was 13" " and the second roll was 4.\n" ), }, ], "initial_session": { "state": {}, "app_name": "hello_world", "user_id": "user", }, }] eval_set = convert_eval_set_to_pydanctic_schema( eval_set_id, eval_set_in_json_format ) assert eval_set.eval_set_id == eval_set_id assert len(eval_set.eval_cases) == 1 assert eval_set.eval_cases[0].eval_id == "roll_17_sided_dice_twice" assert len(eval_set.eval_cases[0].conversation) == 2 assert eval_set.eval_cases[0].session_input.app_name == "hello_world" assert ( len(eval_set.eval_cases[0].conversation[1].intermediate_data.tool_uses) == 2 ) assert ( len( eval_set.eval_cases[0] .conversation[1] .intermediate_data.intermediate_responses ) == 1 ) def test_convert_eval_set_to_pydanctic_schema_minimal(self): eval_set_id = "test_eval_set" eval_set_in_json_format = [{ "name": "minimal_case", "data": [{"query": "Hello", "reference": "World"}], }] eval_set = convert_eval_set_to_pydanctic_schema( eval_set_id, eval_set_in_json_format ) assert eval_set.eval_set_id == eval_set_id assert len(eval_set.eval_cases) == 1 assert eval_set.eval_cases[0].eval_id == "minimal_case" assert len(eval_set.eval_cases[0].conversation) == 1 assert ( eval_set.eval_cases[0].conversation[0].user_content.parts[0].text == "Hello" ) assert ( eval_set.eval_cases[0].conversation[0].final_response.parts[0].text == "World" ) def test_convert_eval_set_to_pydanctic_schema_empty_tool_use_and_intermediate_responses( self, ): eval_set_id = "test_eval_set" eval_set_in_json_format = [{ "name": "empty_lists", "data": [{ "query": "Test", "reference": "Test Ref", "expected_tool_use": [], "expected_intermediate_agent_responses": [], }], }] eval_set = convert_eval_set_to_pydanctic_schema( eval_set_id, eval_set_in_json_format ) assert eval_set.eval_set_id == eval_set_id assert len(eval_set.eval_cases) == 1 assert ( len(eval_set.eval_cases[0].conversation[0].intermediate_data.tool_uses) == 0 ) assert ( len( eval_set.eval_cases[0] .conversation[0] .intermediate_data.intermediate_responses ) == 0 ) def test_convert_eval_set_to_pydanctic_schema_empty_initial_session(self): eval_set_id = "test_eval_set" eval_set_in_json_format = [{ "name": "empty_session", "data": [{"query": "Test", "reference": "Test Ref"}], "initial_session": {}, }] eval_set = convert_eval_set_to_pydanctic_schema( eval_set_id, eval_set_in_json_format ) assert eval_set.eval_set_id == eval_set_id assert eval_set.eval_cases[0].session_input is None def test_convert_eval_set_to_pydanctic_schema_invalid_data(self): # This test implicitly checks for potential validation errors during Pydantic # object creation eval_set_id = "test_eval_set" eval_set_in_json_format = [{ "name": 123, # Invalid name type "data": [{ "query": 456, # Invalid query type "reference": 789, # Invalid reference type "expected_tool_use": [{ "tool_name": 123, "tool_input": 456, }], # Invalid tool name and input "expected_intermediate_agent_responses": [ {"author": 123, "text": 456} # Invalid author and text ], }], "initial_session": { "state": "invalid", # Invalid state type "app_name": 123, # Invalid app_name type "user_id": 456, # Invalid user_id type }, }] with pytest.raises(ValidationError): convert_eval_set_to_pydanctic_schema(eval_set_id, eval_set_in_json_format) class TestLoadEvalSetFromFile: """Tests for load_eval_set_from_file method.""" def test_load_eval_set_from_file_new_format(self, tmp_path): # Create a dummy file with EvalSet in the new Pydantic JSON format eval_set = EvalSet( eval_set_id="new_format_eval_set", eval_cases=[ EvalCase( eval_id="new_format_case", conversation=[ Invocation( invocation_id=str(uuid.uuid4()), user_content=genai_types.Content( parts=[genai_types.Part(text="New Format Query")] ), final_response=genai_types.Content( parts=[ genai_types.Part(text="New Format Reference") ] ), ) ], ) ], ) file_path = tmp_path / "new_format.json" with open(file_path, "w", encoding="utf-8") as f: f.write(eval_set.model_dump_json()) loaded_eval_set = load_eval_set_from_file( str(file_path), "new_format_eval_set" ) assert loaded_eval_set == eval_set def test_load_eval_set_from_file_old_format(self, tmp_path, mocker): mocked_time = 12345678 mocked_invocation_id = "15061953" mocker.patch("time.time", return_value=mocked_time) mocker.patch("uuid.uuid4", return_value=mocked_invocation_id) # Create a dummy file with EvalSet in the old JSON format old_format_json = [{ "name": "old_format_case", "data": [ {"query": "Old Format Query", "reference": "Old Format Reference"} ], }] file_path = tmp_path / "old_format.json" with open(file_path, "w", encoding="utf-8") as f: json.dump(old_format_json, f) loaded_eval_set = load_eval_set_from_file( str(file_path), "old_format_eval_set" ) expected_eval_set = EvalSet( eval_set_id="old_format_eval_set", name="old_format_eval_set", creation_timestamp=mocked_time, eval_cases=[ EvalCase( eval_id="old_format_case", creation_timestamp=mocked_time, conversation=[ Invocation( invocation_id=mocked_invocation_id, user_content=genai_types.Content( parts=[genai_types.Part(text="Old Format Query")], role="user", ), final_response=genai_types.Content( parts=[ genai_types.Part(text="Old Format Reference") ], role="model", ), intermediate_data=IntermediateData( tool_uses=[], intermediate_responses=[], ), creation_timestamp=mocked_time, ) ], ) ], ) assert loaded_eval_set == expected_eval_set def test_load_eval_set_from_file_nonexistent_file(self): with pytest.raises(FileNotFoundError): load_eval_set_from_file("nonexistent_file.json", "test_eval_set") def test_load_eval_set_from_file_invalid_json(self, tmp_path): # Create a dummy file with invalid JSON file_path = tmp_path / "invalid.json" with open(file_path, "w", encoding="utf-8") as f: f.write("invalid json") with pytest.raises(json.JSONDecodeError): load_eval_set_from_file(str(file_path), "test_eval_set") def test_load_eval_set_from_file_invalid_data(self, tmp_path, mocker): # Create a dummy file with invalid data that fails both Pydantic validation # and the old format conversion. We mock the # convert_eval_set_to_pydanctic_schema function to raise a ValueError # so that we can assert that the exception is raised. file_path = tmp_path / "invalid_data.json" with open(file_path, "w", encoding="utf-8") as f: f.write('{"invalid": "data"}') mocker.patch( "google.adk.evaluation.local_eval_sets_manager.convert_eval_set_to_pydanctic_schema", side_effect=ValueError(), ) with pytest.raises(ValueError): load_eval_set_from_file(str(file_path), "test_eval_set") class TestLocalEvalSetsManager: """Tests for LocalEvalSetsManager.""" @pytest.fixture def local_eval_sets_manager(tmp_path): agents_dir = str(tmp_path) return LocalEvalSetsManager(agents_dir=agents_dir) def test_local_eval_sets_manager_get_eval_set_success( self, local_eval_sets_manager, mocker ): app_name = "test_app" eval_set_id = "test_eval_set" mock_eval_set = EvalSet(eval_set_id=eval_set_id, eval_cases=[]) mocker.patch( "google.adk.evaluation.local_eval_sets_manager.load_eval_set_from_file", return_value=mock_eval_set, ) mocker.patch("os.path.exists", return_value=True) eval_set = local_eval_sets_manager.get_eval_set(app_name, eval_set_id) assert eval_set == mock_eval_set def test_local_eval_sets_manager_get_eval_set_not_found( self, local_eval_sets_manager, mocker ): app_name = "test_app" eval_set_id = "test_eval_set" mocker.patch( "google.adk.evaluation.local_eval_sets_manager.load_eval_set_from_file", side_effect=FileNotFoundError, ) eval_set = local_eval_sets_manager.get_eval_set(app_name, eval_set_id) assert eval_set is None def test_local_eval_sets_manager_create_eval_set_success( self, local_eval_sets_manager, mocker ): mocked_time = 12345678 mocker.patch("time.time", return_value=mocked_time) app_name = "test_app" eval_set_id = "test_eval_set" mocker.patch("os.path.exists", return_value=False) mock_write_eval_set = mocker.patch( "google.adk.evaluation.local_eval_sets_manager.LocalEvalSetsManager._write_eval_set" ) eval_set_file_path = os.path.join( local_eval_sets_manager._agents_dir, app_name, eval_set_id + _EVAL_SET_FILE_EXTENSION, ) local_eval_sets_manager.create_eval_set(app_name, eval_set_id) mock_write_eval_set.assert_called_once_with( eval_set_file_path, EvalSet( eval_set_id=eval_set_id, name=eval_set_id, eval_cases=[], creation_timestamp=mocked_time, ), ) def test_local_eval_sets_manager_create_eval_set_invalid_id( self, local_eval_sets_manager ): app_name = "test_app" eval_set_id = "invalid-id" with pytest.raises(ValueError, match="Invalid Eval Set Id"): local_eval_sets_manager.create_eval_set(app_name, eval_set_id) def test_local_eval_sets_manager_list_eval_sets_success( self, local_eval_sets_manager, mocker ): app_name = "test_app" mock_listdir_return = [ "eval_set_1.evalset.json", "eval_set_2.evalset.json", "not_an_eval_set.txt", ] mocker.patch("os.listdir", return_value=mock_listdir_return) mocker.patch("os.path.join", return_value="dummy_path") mocker.patch("os.path.basename", side_effect=lambda x: x) eval_sets = local_eval_sets_manager.list_eval_sets(app_name) assert eval_sets == ["eval_set_1", "eval_set_2"] def test_local_eval_sets_manager_add_eval_case_success( self, local_eval_sets_manager, mocker ): app_name = "test_app" eval_set_id = "test_eval_set" eval_case_id = "test_eval_case" mock_eval_case = EvalCase(eval_id=eval_case_id, conversation=[]) mock_eval_set = EvalSet(eval_set_id=eval_set_id, eval_cases=[]) mocker.patch( "google.adk.evaluation.local_eval_sets_manager.LocalEvalSetsManager.get_eval_set", return_value=mock_eval_set, ) mock_write_eval_set = mocker.patch( "google.adk.evaluation.local_eval_sets_manager.LocalEvalSetsManager._write_eval_set" ) local_eval_sets_manager.add_eval_case(app_name, eval_set_id, mock_eval_case) assert len(mock_eval_set.eval_cases) == 1 assert mock_eval_set.eval_cases[0] == mock_eval_case expected_eval_set_file_path = os.path.join( local_eval_sets_manager._agents_dir, app_name, eval_set_id + _EVAL_SET_FILE_EXTENSION, ) mock_eval_set.eval_cases.append(mock_eval_case) mock_write_eval_set.assert_called_once_with( expected_eval_set_file_path, mock_eval_set ) def test_local_eval_sets_manager_add_eval_case_eval_set_not_found( self, local_eval_sets_manager, mocker ): app_name = "test_app" eval_set_id = "test_eval_set" eval_case_id = "test_eval_case" mock_eval_case = EvalCase(eval_id=eval_case_id, conversation=[]) mocker.patch( "google.adk.evaluation.local_eval_sets_manager.LocalEvalSetsManager.get_eval_set", return_value=None, ) with pytest.raises( NotFoundError, match="Eval set `test_eval_set` not found." ): local_eval_sets_manager.add_eval_case( app_name, eval_set_id, mock_eval_case ) def test_local_eval_sets_manager_add_eval_case_eval_case_id_exists( self, local_eval_sets_manager, mocker ): app_name = "test_app" eval_set_id = "test_eval_set" eval_case_id = "test_eval_case" mock_eval_case = EvalCase(eval_id=eval_case_id, conversation=[]) mock_eval_set = EvalSet( eval_set_id=eval_set_id, eval_cases=[mock_eval_case] ) mocker.patch( "google.adk.evaluation.local_eval_sets_manager.LocalEvalSetsManager.get_eval_set", return_value=mock_eval_set, ) with pytest.raises( ValueError, match=( f"Eval id `{eval_case_id}` already exists in `{eval_set_id}` eval" " set." ), ): local_eval_sets_manager.add_eval_case( app_name, eval_set_id, mock_eval_case ) def test_local_eval_sets_manager_get_eval_case_success( self, local_eval_sets_manager, mocker ): app_name = "test_app" eval_set_id = "test_eval_set" eval_case_id = "test_eval_case" mock_eval_case = EvalCase(eval_id=eval_case_id, conversation=[]) mock_eval_set = EvalSet( eval_set_id=eval_set_id, eval_cases=[mock_eval_case] ) mocker.patch( "google.adk.evaluation.local_eval_sets_manager.LocalEvalSetsManager.get_eval_set", return_value=mock_eval_set, ) eval_case = local_eval_sets_manager.get_eval_case( app_name, eval_set_id, eval_case_id ) assert eval_case == mock_eval_case def test_local_eval_sets_manager_get_eval_case_eval_set_not_found( self, local_eval_sets_manager, mocker ): app_name = "test_app" eval_set_id = "test_eval_set" eval_case_id = "test_eval_case" mocker.patch( "google.adk.evaluation.local_eval_sets_manager.LocalEvalSetsManager.get_eval_set", return_value=None, ) eval_case = local_eval_sets_manager.get_eval_case( app_name, eval_set_id, eval_case_id ) assert eval_case is None def test_local_eval_sets_manager_get_eval_case_eval_case_not_found( self, local_eval_sets_manager, mocker ): app_name = "test_app" eval_set_id = "test_eval_set" eval_case_id = "test_eval_case" mock_eval_set = EvalSet(eval_set_id=eval_set_id, eval_cases=[]) mocker.patch( "google.adk.evaluation.local_eval_sets_manager.LocalEvalSetsManager.get_eval_set", return_value=mock_eval_set, ) eval_case = local_eval_sets_manager.get_eval_case( app_name, eval_set_id, eval_case_id ) assert eval_case is None def test_local_eval_sets_manager_update_eval_case_success( self, local_eval_sets_manager, mocker ): app_name = "test_app" eval_set_id = "test_eval_set" eval_case_id = "test_eval_case" mock_eval_case = EvalCase( eval_id=eval_case_id, conversation=[], creation_timestamp=456 ) updated_eval_case = EvalCase( eval_id=eval_case_id, conversation=[], creation_timestamp=123 ) mock_eval_set = EvalSet( eval_set_id=eval_set_id, eval_cases=[mock_eval_case] ) mocker.patch( "google.adk.evaluation.local_eval_sets_manager.LocalEvalSetsManager.get_eval_set", return_value=mock_eval_set, ) mocker.patch( "google.adk.evaluation.local_eval_sets_manager.LocalEvalSetsManager.get_eval_case", return_value=mock_eval_case, ) mock_write_eval_set = mocker.patch( "google.adk.evaluation.local_eval_sets_manager.LocalEvalSetsManager._write_eval_set" ) local_eval_sets_manager.update_eval_case( app_name, eval_set_id, updated_eval_case ) assert len(mock_eval_set.eval_cases) == 1 assert mock_eval_set.eval_cases[0] == updated_eval_case expected_eval_set_file_path = os.path.join( local_eval_sets_manager._agents_dir, app_name, eval_set_id + _EVAL_SET_FILE_EXTENSION, ) mock_write_eval_set.assert_called_once_with( expected_eval_set_file_path, EvalSet(eval_set_id=eval_set_id, eval_cases=[updated_eval_case]), ) def test_local_eval_sets_manager_update_eval_case_eval_case_not_found( self, local_eval_sets_manager, mocker ): app_name = "test_app" eval_set_id = "test_eval_set" eval_case_id = "test_eval_case" updated_eval_case = EvalCase(eval_id=eval_case_id, conversation=[]) mocker.patch( "google.adk.evaluation.local_eval_sets_manager.LocalEvalSetsManager.get_eval_case", return_value=None, ) with pytest.raises( NotFoundError, match=( f"Eval Set `{eval_set_id}` or Eval id `{eval_case_id}` not found." ), ): local_eval_sets_manager.update_eval_case( app_name, eval_set_id, updated_eval_case ) def test_local_eval_sets_manager_delete_eval_case_success( self, local_eval_sets_manager, mocker ): app_name = "test_app" eval_set_id = "test_eval_set" eval_case_id = "test_eval_case" mock_eval_case = EvalCase(eval_id=eval_case_id, conversation=[]) mock_eval_set = EvalSet( eval_set_id=eval_set_id, eval_cases=[mock_eval_case] ) mocker.patch( "google.adk.evaluation.local_eval_sets_manager.LocalEvalSetsManager.get_eval_set", return_value=mock_eval_set, ) mocker.patch( "google.adk.evaluation.local_eval_sets_manager.LocalEvalSetsManager.get_eval_case", return_value=mock_eval_case, ) mock_write_eval_set = mocker.patch( "google.adk.evaluation.local_eval_sets_manager.LocalEvalSetsManager._write_eval_set" ) local_eval_sets_manager.delete_eval_case( app_name, eval_set_id, eval_case_id ) assert len(mock_eval_set.eval_cases) == 0 expected_eval_set_file_path = os.path.join( local_eval_sets_manager._agents_dir, app_name, eval_set_id + _EVAL_SET_FILE_EXTENSION, ) mock_write_eval_set.assert_called_once_with( expected_eval_set_file_path, EvalSet(eval_set_id=eval_set_id, eval_cases=[]), ) def test_local_eval_sets_manager_delete_eval_case_eval_case_not_found( self, local_eval_sets_manager, mocker ): app_name = "test_app" eval_set_id = "test_eval_set" eval_case_id = "test_eval_case" mocker.patch( "google.adk.evaluation.local_eval_sets_manager.LocalEvalSetsManager.get_eval_case", return_value=None, ) mock_write_eval_set = mocker.patch( "google.adk.evaluation.local_eval_sets_manager.LocalEvalSetsManager._write_eval_set" ) with pytest.raises( NotFoundError, match=( f"Eval Set `{eval_set_id}` or Eval id `{eval_case_id}` not found." ), ): local_eval_sets_manager.delete_eval_case( app_name, eval_set_id, eval_case_id ) mock_write_eval_set.assert_not_called()