Update Eval Run and TrajectoryEvaluator to use the new schema.

PiperOrigin-RevId: 758927160
This commit is contained in:
Ankur Sharma
2025-05-14 19:15:52 -07:00
committed by Copybara-Service
parent 2cb74dd20e
commit ee674ce0ef
9 changed files with 418 additions and 244 deletions
+31 -12
View File
@@ -296,6 +296,7 @@ def cli_eval(
from .cli_eval import parse_and_get_evals_to_run
from .cli_eval import run_evals
from .cli_eval import try_get_reset_func
from ..evaluation.local_eval_sets_manager import load_eval_set_from_file
except ModuleNotFoundError:
raise click.ClickException(MISSING_EVAL_DEPENDENCIES_MESSAGE)
@@ -311,17 +312,27 @@ def cli_eval(
root_agent = get_root_agent(agent_module_file_path)
reset_func = try_get_reset_func(agent_module_file_path)
eval_set_to_evals = parse_and_get_evals_to_run(eval_set_file_path)
eval_set_file_path_to_evals = parse_and_get_evals_to_run(eval_set_file_path)
eval_set_id_to_eval_cases = {}
# Read the eval_set files and get the cases.
for eval_set_file_path, eval_case_ids in eval_set_file_path_to_evals.items():
eval_set = load_eval_set_from_file(eval_set_file_path, eval_set_file_path)
eval_cases = eval_set.eval_cases
if eval_case_ids:
# There are eval_ids that we should select.
eval_cases = [
e for e in eval_set.eval_cases if e.eval_id in eval_case_ids
]
eval_set_id_to_eval_cases[eval_set_file_path] = eval_cases
async def _collect_eval_results() -> list[EvalCaseResult]:
return [
result
async for result in run_evals(
eval_set_to_evals,
root_agent,
reset_func,
eval_metrics,
print_detailed_results=print_detailed_results,
eval_set_id_to_eval_cases, root_agent, reset_func, eval_metrics
)
]
@@ -336,20 +347,28 @@ def cli_eval(
for eval_result in eval_results:
eval_result: EvalCaseResult
if eval_result.eval_set_file not in eval_run_summary:
eval_run_summary[eval_result.eval_set_file] = [0, 0]
if eval_result.eval_set_id not in eval_run_summary:
eval_run_summary[eval_result.eval_set_id] = [0, 0]
if eval_result.final_eval_status == EvalStatus.PASSED:
eval_run_summary[eval_result.eval_set_file][0] += 1
eval_run_summary[eval_result.eval_set_id][0] += 1
else:
eval_run_summary[eval_result.eval_set_file][1] += 1
eval_run_summary[eval_result.eval_set_id][1] += 1
print("Eval Run Summary")
for eval_set_file, pass_fail_count in eval_run_summary.items():
for eval_set_id, pass_fail_count in eval_run_summary.items():
print(
f"{eval_set_file}:\n Tests passed: {pass_fail_count[0]}\n Tests"
f"{eval_set_id}:\n Tests passed: {pass_fail_count[0]}\n Tests"
f" failed: {pass_fail_count[1]}"
)
if print_detailed_results:
for eval_result in eval_results:
eval_result: EvalCaseResult
print(
"*********************************************************************"
)
print(eval_result.model_dump_json(indent=2))
@main.command("web")
@click.option(