# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Evaluate all agents in the fixture folder if evaluation test files exist."""

import os

from google.adk.evaluation import AgentEvaluator
import pytest


def agent_eval_artifacts_in_fixture():
  """Get all agents from the fixture folder."""
  agent_eval_artifacts = []
  fixture_dir = os.path.join(os.path.dirname(__file__), 'fixture')
  for agent_name in os.listdir(fixture_dir):
    agent_dir = os.path.join(fixture_dir, agent_name)
    if not os.path.isdir(agent_dir):
      continue
    for filename in os.listdir(agent_dir):
      # Evaluation test files end with test.json.
      if not filename.endswith('test.json'):
        continue
      initial_session_file = (
          f'tests/integration/fixture/{agent_name}/initial.session.json'
      )
      agent_eval_artifacts.append((
          f'tests.integration.fixture.{agent_name}',
          f'tests/integration/fixture/{agent_name}/{filename}',
          initial_session_file
          if os.path.exists(initial_session_file)
          else None,
      ))

  # This function gets invoked twice; sorting helps ensure that both
  # invocations see the same view.
  agent_eval_artifacts = sorted(
      agent_eval_artifacts, key=lambda item: f'{item[0]}|{item[1]}'
  )
  return agent_eval_artifacts


@pytest.mark.parametrize(
    'agent_name, evalfile, initial_session_file',
    agent_eval_artifacts_in_fixture(),
    ids=[agent_name for agent_name, _, _ in agent_eval_artifacts_in_fixture()],
)
def test_evaluate_agents_long_running_4_runs_per_eval_item(
    agent_name, evalfile, initial_session_file
):
  """Test agent evaluation in the fixture folder.

  After querying the fixture folder, we have 5 eval items. For each eval item
  we use 4 runs. A single eval item is a session that can have multiple
  queries in it.
  """
  AgentEvaluator.evaluate(
      agent_module=agent_name,
      eval_dataset_file_path_or_dir=evalfile,
      initial_session_file=initial_session_file,
      # Using a slightly higher value helps us manage the variance that may
      # occur in each eval.
      # This, of course, comes at the cost of increased test run times.
      num_runs=4,
  )