Define EvalReport data model.

PiperOrigin-RevId: 757933585
Google Team Member 2025-05-12 15:22:22 -07:00 committed by Copybara-Service
parent 1237d5334f
commit 660c2efa4d
3 changed files with 23 additions and 11 deletions

File 1 of 3

@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import datetime
 from enum import Enum
 import importlib.util
 import json
@@ -25,8 +26,10 @@ from typing import Optional
 import uuid
 from pydantic import BaseModel
+from pydantic import Field
 from ..agents import Agent
+from ..sessions.session import Session

 logger = logging.getLogger(__name__)
@@ -43,16 +46,25 @@ class EvalMetric(BaseModel):
 class EvalMetricResult(BaseModel):
-  score: Optional[float]
+  score: Optional[float] = None
   eval_status: EvalStatus

-class EvalResult(BaseModel):
+class EvalCaseResult(BaseModel):
   eval_set_file: str
   eval_id: str
   final_eval_status: EvalStatus
   eval_metric_results: list[tuple[EvalMetric, EvalMetricResult]]
   session_id: str
+  session_details: Optional[Session] = None
+
+
+class EvalSetResult(BaseModel):
+  eval_set_result_id: str
+  eval_set_result_name: str
+  eval_set_id: str
+  eval_case_results: list[EvalCaseResult] = Field(default_factory=list)
+  creation_timestamp: float = 0.0

 MISSING_EVAL_DEPENDENCIES_MESSAGE = (
@@ -154,7 +166,7 @@ async def run_evals(
     session_service=None,
     artifact_service=None,
     print_detailed_results=False,
-) -> AsyncGenerator[EvalResult, None]:
+) -> AsyncGenerator[EvalCaseResult, None]:
   try:
     from ..evaluation.agent_evaluator import EvaluationGenerator
     from ..evaluation.response_evaluator import ResponseEvaluator
@@ -249,7 +261,7 @@ async def run_evals(
       else:
         raise ValueError("Unknown eval status.")
-      yield EvalResult(
+      yield EvalCaseResult(
           eval_set_file=eval_set_file,
          eval_id=eval_name,
          final_eval_status=final_eval_status,
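
Note: the hunks above rename EvalResult to EvalCaseResult (a per-case record) and add EvalSetResult, which aggregates the case results of one eval set run. The following is a minimal, standalone sketch of how the two models compose; the field names come from the diff, while the simplified EvalStatus enum and the omission of the metric and session fields are assumptions made only to keep the example self-contained.

# Simplified stand-ins for the models defined in the diff above.
import time
import uuid
from enum import Enum

from pydantic import BaseModel, Field


class EvalStatus(Enum):
  # Assumed members; the real enum lives elsewhere in the module.
  PASSED = 1
  FAILED = 2


class EvalCaseResult(BaseModel):
  # Subset of the fields shown in the diff.
  eval_set_file: str
  eval_id: str
  final_eval_status: EvalStatus
  session_id: str


class EvalSetResult(BaseModel):
  eval_set_result_id: str
  eval_set_result_name: str
  eval_set_id: str
  eval_case_results: list[EvalCaseResult] = Field(default_factory=list)
  creation_timestamp: float = 0.0


# Aggregate per-case results into one report for the eval set.
case = EvalCaseResult(
    eval_set_file="set1.json",
    eval_id="eval_1",
    final_eval_status=EvalStatus.PASSED,
    session_id="session_123",
)
report = EvalSetResult(
    eval_set_result_id=str(uuid.uuid4()),
    eval_set_result_name="nightly run",
    eval_set_id="set1",
    eval_case_results=[case],
    creation_timestamp=time.time(),
)
print(report.model_dump_json(indent=2))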

File 2 of 3

@@ -245,7 +245,7 @@ def cli_eval(
   try:
     from .cli_eval import EvalMetric
-    from .cli_eval import EvalResult
+    from .cli_eval import EvalCaseResult
     from .cli_eval import EvalStatus
     from .cli_eval import get_evaluation_criteria_or_default
     from .cli_eval import get_root_agent
@@ -269,7 +269,7 @@ def cli_eval(
   eval_set_to_evals = parse_and_get_evals_to_run(eval_set_file_path)

-  async def _collect_eval_results() -> list[EvalResult]:
+  async def _collect_eval_results() -> list[EvalCaseResult]:
     return [
         result
         async for result in run_evals(
@@ -290,7 +290,7 @@ def cli_eval(
   eval_run_summary = {}
   for eval_result in eval_results:
-    eval_result: EvalResult
+    eval_result: EvalCaseResult
     if eval_result.eval_set_file not in eval_run_summary:
       eval_run_summary[eval_result.eval_set_file] = [0, 0]
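
For context, _collect_eval_results drains the run_evals async generator with an async comprehension, and the summary loop keeps a two-element list per eval set file; the diff only shows the [0, 0] initialization, so treating it as [passed, failed] counts is an assumption. Below is a hedged sketch of that collect-and-tally pattern using stand-in results rather than the real run_evals API.

# Sketch of the collect-and-tally pattern from cli_eval; _fake_run_evals and
# the SimpleNamespace results are stand-ins, not the real run_evals signature.
import asyncio
from types import SimpleNamespace


async def _fake_run_evals():
  yield SimpleNamespace(eval_set_file="set1.json", final_eval_status="PASSED")
  yield SimpleNamespace(eval_set_file="set1.json", final_eval_status="FAILED")


async def _collect_eval_results():
  # The async comprehension drains the generator into a plain list.
  return [result async for result in _fake_run_evals()]


eval_results = asyncio.run(_collect_eval_results())

# Tally per eval set file; assumed layout is [passed, failed].
eval_run_summary = {}
for eval_result in eval_results:
  if eval_result.eval_set_file not in eval_run_summary:
    eval_run_summary[eval_result.eval_set_file] = [0, 0]
  if eval_result.final_eval_status == "PASSED":
    eval_run_summary[eval_result.eval_set_file][0] += 1
  else:
    eval_run_summary[eval_result.eval_set_file][1] += 1

print(eval_run_summary)  # {'set1.json': [1, 1]}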

File 3 of 3

@@ -250,7 +250,7 @@ def test_cli_eval_success_path(
     def __init__(self, metric_name: str, threshold: float) -> None:
       ...

-  class _EvalResult:
+  class _EvalCaseResult:
     def __init__(self, eval_set_file: str, final_eval_status: str) -> None:
       self.eval_set_file = eval_set_file
@@ -261,7 +261,7 @@ def test_cli_eval_success_path(
   # helper funcs
   stub.EvalMetric = _EvalMetric
-  stub.EvalResult = _EvalResult
+  stub.EvalCaseResult = _EvalCaseResult
   stub.EvalStatus = _EvalStatus
   stub.MISSING_EVAL_DEPENDENCIES_MESSAGE = "stub msg"
@@ -272,8 +272,8 @@ def test_cli_eval_success_path(
   # Create an async generator function for run_evals
   async def mock_run_evals(*_a, **_k):
-    yield _EvalResult("set1.json", "PASSED")
-    yield _EvalResult("set1.json", "FAILED")
+    yield _EvalCaseResult("set1.json", "PASSED")
+    yield _EvalCaseResult("set1.json", "FAILED")

   stub.run_evals = mock_run_evals
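
The test replaces run_evals with mock_run_evals, an async generator function, so the CLI's async-for iteration sees two fake results without doing any real evaluation. A minimal illustration of why the stub must itself be an async generator is below; the surrounding stub-module wiring is elided and assumed to match the rest of the test file.

# An async generator stub is a drop-in for run_evals because callers iterate
# it with `async for`; a coroutine returning a list would not fit that protocol.
import asyncio


class _EvalCaseResult:
  def __init__(self, eval_set_file: str, final_eval_status: str) -> None:
    self.eval_set_file = eval_set_file
    self.final_eval_status = final_eval_status


async def mock_run_evals(*_a, **_k):
  yield _EvalCaseResult("set1.json", "PASSED")
  yield _EvalCaseResult("set1.json", "FAILED")


async def _consume():
  return [r async for r in mock_run_evals()]


results = asyncio.run(_consume())
assert [r.final_eval_status for r in results] == ["PASSED", "FAILED"]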