Update AgentEvaluator to the new EvalSchema

PiperOrigin-RevId: 759293759
Ankur Sharma
2025-05-15 14:08:39 -07:00
committed by Copybara-Service
parent bdd678db31
commit 4c6820e78c
9 changed files with 153 additions and 149 deletions


@@ -18,8 +18,13 @@ from os import path
from typing import Dict
from typing import List
from typing import Union
import uuid
from .eval_set import EvalSet
from .evaluation_generator import EvaluationGenerator
from .evaluator import EvalStatus
from .evaluator import EvaluationResult
from .evaluator import Evaluator
from .local_eval_sets_manager import convert_eval_set_to_pydanctic_schema
from .response_evaluator import ResponseEvaluator
from .trajectory_evaluator import TrajectoryEvaluator
@@ -75,6 +80,62 @@ class AgentEvaluator:
)
return DEFAULT_CRITERIA
@staticmethod
async def evaluate_eval_set(
agent_module: str,
eval_set: EvalSet,
criteria: dict[str, float],
num_runs=NUM_RUNS,
agent_name=None,
):
"""Evaluates an agent using the given EvalSet.
Args:
agent_module: The path to the Python module that contains the definition of
the agent. By convention, the code looks for 'root_agent' in the loaded
module.
eval_set: The eval set.
criteria: Evaluation criteria, a dictionary of metric names to their
respective thresholds.
num_runs: Number of times all entries in the eval dataset should be
assessed.
agent_name: The name of the agent.
"""
eval_case_responses_list = await EvaluationGenerator.generate_responses(
eval_set=eval_set,
agent_module_path=agent_module,
repeat_num=num_runs,
agent_name=agent_name,
)
for eval_case_responses in eval_case_responses_list:
actual_invocations = [
invocation
for invocations in eval_case_responses.responses
for invocation in invocations
]
expected_invocations = (
eval_case_responses.eval_case.conversation * num_runs
)
for metric_name, threshold in criteria.items():
metric_evaluator = AgentEvaluator._get_metric_evaluator(
metric_name=metric_name, threshold=threshold
)
evaluation_result: EvaluationResult = (
metric_evaluator.evaluate_invocations(
actual_invocations=actual_invocations,
expected_invocations=expected_invocations,
)
)
assert evaluation_result.overall_eval_status == EvalStatus.PASSED, (
f"`{eval_case_responses.eval_case.eval_id}`: "
f"{metric_name} for {agent_module} Failed. Expected {threshold},"
f" but got {evaluation_result.overall_score}."
)
@staticmethod
async def evaluate(
agent_module,
@@ -109,34 +170,32 @@ class AgentEvaluator:
else:
test_files = [eval_dataset_file_path_or_dir]
initial_session_state = {}
initial_session = {}
if initial_session_file:
with open(initial_session_file, "r") as f:
initial_session_state = json.loads(f.read())["state"]
initial_session = json.loads(f.read())
for test_file in test_files:
dataset = AgentEvaluator._load_dataset(test_file)[0]
data = AgentEvaluator._load_dataset(test_file)[0]
criteria = AgentEvaluator.find_config_for_test_file(test_file)
AgentEvaluator._validate_input([data], criteria)
AgentEvaluator._validate_input([dataset], criteria)
eval_data = {
"name": test_file,
"data": data,
"initial_session": initial_session,
}
evaluation_response = await AgentEvaluator._generate_responses(
agent_module,
[dataset],
num_runs,
agent_name=agent_name,
initial_session={"state": initial_session_state},
eval_set = convert_eval_set_to_pydanctic_schema(
eval_set_id=str(uuid.uuid4()), eval_set_in_json_format=[eval_data]
)
await AgentEvaluator.evaluate_eval_set(
agent_module=agent_module,
eval_set=eval_set,
criteria=criteria,
num_runs=num_runs,
agent_name=agent_name,
)
if AgentEvaluator._response_evaluation_required(criteria, [dataset]):
AgentEvaluator._evaluate_response_scores(
agent_module, evaluation_response, criteria
)
if AgentEvaluator._trajectory_evaluation_required(criteria, [dataset]):
AgentEvaluator._evaluate_tool_trajectory(
agent_module, evaluation_response, criteria
)
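The file-based evaluate path now wraps each loaded dataset in a small dict and converts it into the pydantic EvalSet before delegating to evaluate_eval_set. A standalone sketch of that conversion (illustrative, not from the commit; the file path and the exact contents of "data", which follow the old schema fields such as "query", "reference", and "expected_tool_use", are assumptions):

import json
import uuid

# Hypothetical legacy-format test file; its shape follows the old eval schema.
with open("simple_test.test.json", "r") as f:
  legacy_data = json.load(f)

eval_set = convert_eval_set_to_pydanctic_schema(
    eval_set_id=str(uuid.uuid4()),
    eval_set_in_json_format=[{
        "name": "simple_test.test.json",
        "data": legacy_data,
        "initial_session": {},  # or the parsed initial_session_file contents
    }],
)

From a test, the typical call remains await AgentEvaluator.evaluate(agent_module=..., eval_dataset_file_path_or_dir=...), which performs this wrapping internally.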
@staticmethod
def _load_dataset(
@@ -221,102 +280,13 @@ class AgentEvaluator:
)
@staticmethod
def _get_infer_criteria(eval_dataset):
"""Infers evaluation criteria based on the provided dataset.
def _get_metric_evaluator(metric_name: str, threshold: float) -> Evaluator:
if metric_name == TOOL_TRAJECTORY_SCORE_KEY:
return TrajectoryEvaluator(threshold=threshold)
elif (
metric_name == RESPONSE_MATCH_SCORE_KEY
or metric_name == RESPONSE_EVALUATION_SCORE_KEY
):
return ResponseEvaluator(threshold=threshold, metric_name=metric_name)
Args:
eval_dataset (list): A list of evaluation samples.
Returns:
dict: Inferred evaluation criteria based on dataset fields.
"""
inferred_criteria = {}
sample = eval_dataset[0][0]
if QUERY_COLUMN in sample and EXPECTED_TOOL_USE_COLUMN in sample:
inferred_criteria[TOOL_TRAJECTORY_SCORE_KEY] = DEFAULT_CRITERIA[
TOOL_TRAJECTORY_SCORE_KEY
]
if QUERY_COLUMN in sample and REFERENCE_COLUMN in sample:
inferred_criteria[RESPONSE_MATCH_SCORE_KEY] = DEFAULT_CRITERIA[
RESPONSE_MATCH_SCORE_KEY
]
return inferred_criteria
@staticmethod
async def _generate_responses(
agent_module, eval_dataset, num_runs, agent_name=None, initial_session={}
):
"""Generates evaluation responses by running the agent module multiple times."""
return EvaluationGenerator.generate_responses(
eval_dataset,
agent_module,
repeat_num=num_runs,
agent_name=agent_name,
initial_session=initial_session,
)
@staticmethod
def _response_evaluation_required(criteria, eval_dataset):
"""Checks if response evaluation are needed."""
return REFERENCE_COLUMN in eval_dataset[0][0] and any(
key in criteria
for key in [RESPONSE_EVALUATION_SCORE_KEY, RESPONSE_MATCH_SCORE_KEY]
)
@staticmethod
def _trajectory_evaluation_required(evaluation_criteria, eval_dataset):
"""Checks if response evaluation are needed."""
return (
EXPECTED_TOOL_USE_COLUMN in eval_dataset[0][0]
and TOOL_TRAJECTORY_SCORE_KEY in evaluation_criteria
)
@staticmethod
def _evaluate_response_scores(agent_module, evaluation_response, criteria):
"""Evaluates response scores and raises an assertion error if they don't meet the criteria."""
metrics = ResponseEvaluator.evaluate(
evaluation_response, criteria, print_detailed_results=True
)
AgentEvaluator._assert_score(
metrics,
"coherence/mean",
criteria.get(RESPONSE_EVALUATION_SCORE_KEY),
"Average response evaluation score",
agent_module,
)
AgentEvaluator._assert_score(
metrics,
"rouge_1/mean",
criteria.get(RESPONSE_MATCH_SCORE_KEY),
"Average response match score",
agent_module,
)
@staticmethod
def _evaluate_tool_trajectory(agent_module, evaluation_response, criteria):
"""Evaluates tool trajectory scores and raises an assertion error if they don't meet the criteria."""
score = TrajectoryEvaluator.evaluate(
evaluation_response, print_detailed_results=True
)
AgentEvaluator._assert_score(
{TOOL_TRAJECTORY_SCORE_KEY: score},
TOOL_TRAJECTORY_SCORE_KEY,
criteria[TOOL_TRAJECTORY_SCORE_KEY],
"Average tool trajectory evaluation score",
agent_module,
)
@staticmethod
def _assert_score(metrics, metric_key, threshold, description, agent_module):
"""Asserts that a metric meets the specified threshold."""
if metric_key in metrics:
actual_score = metrics[metric_key]
assert actual_score >= threshold, (
f"{description} for {agent_module} is lower than expected. "
f"Expected >= {threshold}, but got {actual_score}."
)
raise ValueError(f"Unsupported eval metric: {metric_name}")