Add 'get_eval_result' and 'list_eval_results' endpoints.

PiperOrigin-RevId: 757936497
Google Team Member 2025-05-12 15:30:19 -07:00 committed by Copybara-Service
parent df0892a7b8
commit 05a0c6b307
2 changed files with 118 additions and 17 deletions
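
For orientation, the two endpoints added below are plain GET routes on the FastAPI app returned by get_fast_api_app. A minimal client-side sketch of calling them, assuming the app is already being served locally (the host, port, and app name here are hypothetical and not part of this commit):

import requests

BASE_URL = "http://localhost:8000"  # hypothetical local server running the ADK FastAPI app
APP_NAME = "my_agent"  # hypothetical app/agent directory name

# List the ids of all stored eval results for this app.
result_ids = requests.get(f"{BASE_URL}/apps/{APP_NAME}/eval_results").json()
print(result_ids)

# Fetch the full EvalSetResult payload for one of them.
if result_ids:
    detail = requests.get(
        f"{BASE_URL}/apps/{APP_NAME}/eval_results/{result_ids[0]}"
    ).json()
    print(detail)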

View File

@@ -57,6 +57,7 @@ class EvalCaseResult(BaseModel):
   eval_metric_results: list[tuple[EvalMetric, EvalMetricResult]]
   session_id: str
   session_details: Optional[Session] = None
+  user_id: Optional[str] = None


 class EvalSetResult(BaseModel):
@@ -185,6 +186,7 @@ async def run_evals(
       eval_name = eval_item["name"]
       eval_data = eval_item["data"]
       initial_session = eval_item.get("initial_session", {})
+      user_id = initial_session.get("user_id", "test_user_id")

       if evals_to_run and eval_name not in evals_to_run:
         continue
@@ -267,6 +269,7 @@ async def run_evals(
           final_eval_status=final_eval_status,
           eval_metric_results=eval_metric_results,
           session_id=session_id,
+          user_id=user_id,
       )

       if final_eval_status == EvalStatus.PASSED:
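
The user_id threaded through run_evals above comes from the eval case's optional initial_session block, with a fixed fallback. A tiny standalone sketch of that lookup (the eval_item dicts here are invented for illustration):

eval_items = [
    {"name": "greet", "initial_session": {"user_id": "alice"}},
    {"name": "farewell"},  # no initial_session at all
]

for eval_item in eval_items:
    initial_session = eval_item.get("initial_session", {})
    # Same default as run_evals: a missing user_id resolves to "test_user_id".
    user_id = initial_session.get("user_id", "test_user_id")
    print(eval_item["name"], "->", user_id)
# greet -> alice
# farewell -> test_user_id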

View File

@@ -22,13 +22,13 @@ import os
 from pathlib import Path
 import re
 import sys
+import time
 import traceback
 import typing
 from typing import Any
 from typing import List
 from typing import Literal
 from typing import Optional
-from typing import Union

 import click
 from fastapi import FastAPI
@@ -71,8 +71,10 @@ from ..sessions.session import Session
 from ..sessions.vertex_ai_session_service import VertexAiSessionService
 from ..tools.base_toolset import BaseToolset
 from .cli_eval import EVAL_SESSION_ID_PREFIX
+from .cli_eval import EvalCaseResult
 from .cli_eval import EvalMetric
 from .cli_eval import EvalMetricResult
+from .cli_eval import EvalSetResult
 from .cli_eval import EvalStatus
 from .utils import create_empty_state
 from .utils import envs
@@ -81,6 +83,7 @@ from .utils import evals
 logger = logging.getLogger(__name__)

 _EVAL_SET_FILE_EXTENSION = ".evalset.json"
+_EVAL_SET_RESULT_FILE_EXTENSION = ".evalset_result.json"


 class ApiServerSpanExporter(export.SpanExporter):
@@ -137,10 +140,12 @@ class RunEvalResult(BaseModel):
       populate_by_name=True,
   )

+  eval_set_file: str
   eval_set_id: str
   eval_id: str
   final_eval_status: EvalStatus
   eval_metric_results: list[tuple[EvalMetric, EvalMetricResult]]
+  user_id: str
   session_id: str
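
Both fields added to RunEvalResult are plain (non-Optional) strings, so every result returned by the run_eval endpoint must now carry an eval_set_file and a user_id. A stripped-down pydantic sketch of that validation behavior (only a subset of the real model's fields; the values are illustrative):

from pydantic import BaseModel, ValidationError


class RunEvalResultSketch(BaseModel):
    # Subset of RunEvalResult: the two fields added in this commit plus session_id.
    eval_set_file: str
    user_id: str
    session_id: str


ok = RunEvalResultSketch(
    eval_set_file="weather.evalset.json",
    user_id="test_user_id",
    session_id="eval-1234",
)
print(ok.user_id)

try:
    RunEvalResultSketch(session_id="eval-1234")  # missing the two new required fields
except ValidationError as e:
    print(len(e.errors()))  # 2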
@@ -484,15 +489,8 @@ def get_fast_api_app(
           "Eval ids to run list is empty. We will run all evals in the eval set."
       )
     root_agent = await _get_root_agent_async(app_name)
-    return [
-        RunEvalResult(
-            app_name=app_name,
-            eval_set_id=eval_set_id,
-            eval_id=eval_result.eval_id,
-            final_eval_status=eval_result.final_eval_status,
-            eval_metric_results=eval_result.eval_metric_results,
-            session_id=eval_result.session_id,
-        )
+    run_eval_results = []
+    eval_case_results = []
     async for eval_result in run_evals(
         eval_set_to_evals,
         root_agent,
@@ -500,8 +498,108 @@
         req.eval_metrics,
         session_service=session_service,
         artifact_service=artifact_service,
-        )
-    ]
+    ):
+      run_eval_results.append(
+          RunEvalResult(
+              app_name=app_name,
+              eval_set_file=eval_result.eval_set_file,
+              eval_set_id=eval_set_id,
+              eval_id=eval_result.eval_id,
+              final_eval_status=eval_result.final_eval_status,
+              eval_metric_results=eval_result.eval_metric_results,
+              user_id=eval_result.user_id,
+              session_id=eval_result.session_id,
+          )
+      )
+      session = session_service.get_session(
+          app_name=app_name,
+          user_id=eval_result.user_id,
+          session_id=eval_result.session_id,
+      )
+      eval_case_results.append(
+          EvalCaseResult(
+              eval_set_file=eval_result.eval_set_file,
+              eval_id=eval_result.eval_id,
+              final_eval_status=eval_result.final_eval_status,
+              eval_metric_results=eval_result.eval_metric_results,
+              session_id=eval_result.session_id,
+              session_details=session,
+              user_id=eval_result.user_id,
+          )
+      )
+
+    timestamp = time.time()
+    eval_set_result_name = app_name + "_" + eval_set_id + "_" + str(timestamp)
+    eval_set_result = EvalSetResult(
+        eval_set_result_id=eval_set_result_name,
+        eval_set_result_name=eval_set_result_name,
+        eval_set_id=eval_set_id,
+        eval_case_results=eval_case_results,
+        creation_timestamp=timestamp,
+    )
+    # Write eval result file, with eval_set_result_name.
+    app_eval_history_dir = os.path.join(
+        agent_dir, app_name, ".adk", "eval_history"
+    )
+    if not os.path.exists(app_eval_history_dir):
+      os.makedirs(app_eval_history_dir)
+    # Convert to json and write to file.
+    eval_set_result_json = eval_set_result.model_dump_json()
+    eval_set_result_file_path = os.path.join(
+        app_eval_history_dir,
+        eval_set_result_name + _EVAL_SET_RESULT_FILE_EXTENSION,
+    )
+    logger.info("Writing eval result to file: %s", eval_set_result_file_path)
+    with open(eval_set_result_file_path, "w") as f:
+      f.write(json.dumps(eval_set_result_json, indent=2))
+
+    return run_eval_results
+
+  @app.get(
+      "/apps/{app_name}/eval_results/{eval_result_id}",
+      response_model_exclude_none=True,
+  )
+  def get_eval_result(
+      app_name: str,
+      eval_result_id: str,
+  ) -> EvalSetResult:
+    """Gets the eval result for the given eval id."""
+    # Load the eval set file data
+    maybe_eval_result_file_path = (
+        os.path.join(
+            agent_dir, app_name, ".adk", "eval_history", eval_result_id
+        )
+        + _EVAL_SET_RESULT_FILE_EXTENSION
+    )
+    if not os.path.exists(maybe_eval_result_file_path):
+      raise HTTPException(
+          status_code=404,
+          detail=f"Eval result `{eval_result_id}` not found.",
+      )
+    with open(maybe_eval_result_file_path, "r") as file:
+      eval_result_data = json.load(file)  # Load JSON into a list
+
+    try:
+      eval_result = EvalSetResult.model_validate_json(eval_result_data)
+      return eval_result
+    except ValidationError as e:
+      logger.exception("get_eval_result validation error: %s", e)
+
+  @app.get(
+      "/apps/{app_name}/eval_results",
+      response_model_exclude_none=True,
+  )
+  def list_eval_results(app_name: str) -> list[str]:
+    """Lists all eval results for the given app."""
+    app_eval_history_directory = os.path.join(
+        agent_dir, app_name, ".adk", "eval_history"
+    )
+    eval_result_files = [
+        file.removesuffix(_EVAL_SET_RESULT_FILE_EXTENSION)
+        for file in os.listdir(app_eval_history_directory)
+        if file.endswith(_EVAL_SET_RESULT_FILE_EXTENSION)
+    ]
+
+    return eval_result_files
+
   @app.delete("/apps/{app_name}/users/{user_id}/sessions/{session_id}")
   def delete_session(app_name: str, user_id: str, session_id: str):
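
Taken together, run_eval now persists each run as <app_name>_<eval_set_id>_<timestamp>.evalset_result.json under <agent_dir>/<app_name>/.adk/eval_history, storing the model as a JSON-encoded string (json.dumps applied to model_dump_json), and the two new GET endpoints list and re-parse those files. A self-contained sketch of that round trip, using a temporary directory and a minimal stand-in model instead of the real EvalSetResult (all names below are illustrative only):

import json
import os
import tempfile
import time

from pydantic import BaseModel

_EVAL_SET_RESULT_FILE_EXTENSION = ".evalset_result.json"


class EvalSetResultSketch(BaseModel):
    # Minimal stand-in for cli_eval.EvalSetResult; field names mirror the diff.
    eval_set_result_id: str
    eval_set_result_name: str
    eval_set_id: str
    creation_timestamp: float


with tempfile.TemporaryDirectory() as agent_dir:
    app_name, eval_set_id = "my_agent", "smoke_tests"  # hypothetical names
    timestamp = time.time()
    eval_set_result_name = app_name + "_" + eval_set_id + "_" + str(timestamp)
    result = EvalSetResultSketch(
        eval_set_result_id=eval_set_result_name,
        eval_set_result_name=eval_set_result_name,
        eval_set_id=eval_set_id,
        creation_timestamp=timestamp,
    )

    # Write, mirroring run_eval: the model is serialized to a JSON string first,
    # and that string is itself JSON-encoded before being written to disk.
    history_dir = os.path.join(agent_dir, app_name, ".adk", "eval_history")
    os.makedirs(history_dir, exist_ok=True)
    file_path = os.path.join(
        history_dir, eval_set_result_name + _EVAL_SET_RESULT_FILE_EXTENSION
    )
    with open(file_path, "w") as f:
        f.write(json.dumps(result.model_dump_json(), indent=2))

    # List, mirroring list_eval_results: strip the suffix to get result ids.
    result_ids = [
        name.removesuffix(_EVAL_SET_RESULT_FILE_EXTENSION)
        for name in os.listdir(history_dir)
        if name.endswith(_EVAL_SET_RESULT_FILE_EXTENSION)
    ]
    print(result_ids)

    # Read back, mirroring get_eval_result: json.load undoes the outer encoding
    # and yields the inner JSON string, which model_validate_json parses directly.
    with open(file_path) as f:
        inner_json = json.load(f)
    restored = EvalSetResultSketch.model_validate_json(inner_json)
    assert restored == result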