Mirror of https://github.com/EvolutionAPI/adk-python.git, synced 2025-07-15 11:42:54 -06:00
Define EvalReport data model.

PiperOrigin-RevId: 757933585

parent 1237d5334f
commit 660c2efa4d
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import datetime
 from enum import Enum
 import importlib.util
 import json
@@ -25,8 +26,10 @@ from typing import Optional
 import uuid
 
 from pydantic import BaseModel
+from pydantic import Field
 
 from ..agents import Agent
+from ..sessions.session import Session
 
 logger = logging.getLogger(__name__)
 
@@ -43,16 +46,25 @@ class EvalMetric(BaseModel):
 
 
 class EvalMetricResult(BaseModel):
-  score: Optional[float]
+  score: Optional[float] = None
   eval_status: EvalStatus
 
 
-class EvalResult(BaseModel):
+class EvalCaseResult(BaseModel):
   eval_set_file: str
   eval_id: str
   final_eval_status: EvalStatus
   eval_metric_results: list[tuple[EvalMetric, EvalMetricResult]]
   session_id: str
+  session_details: Optional[Session] = None
 
 
+class EvalSetResult(BaseModel):
+  eval_set_result_id: str
+  eval_set_result_name: str
+  eval_set_id: str
+  eval_case_results: list[EvalCaseResult] = Field(default_factory=list)
+  creation_timestamp: float = 0.0
+
+
 MISSING_EVAL_DEPENDENCIES_MESSAGE = (
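
For orientation, a minimal sketch of how the models introduced above fit together. The field names follow the hunk itself; the import path, the EvalStatus.PASSED member, and the EvalMetric(metric_name, threshold) signature are assumptions inferred from elsewhere in this diff, not definitions made by the commit.

# Illustrative only -- not part of the commit.
# The import path below is an assumption; the models are defined in the
# ADK CLI eval module patched by this diff.
# from google.adk.cli.cli_eval import (
#     EvalCaseResult, EvalMetric, EvalMetricResult, EvalSetResult, EvalStatus)
import time
import uuid

metric = EvalMetric(metric_name="response_match_score", threshold=0.8)  # assumed fields
metric_result = EvalMetricResult(score=0.9, eval_status=EvalStatus.PASSED)  # PASSED assumed

case_result = EvalCaseResult(
    eval_set_file="weather_agent.test.json",   # placeholder file name
    eval_id="simple_greeting",                 # placeholder eval id
    final_eval_status=EvalStatus.PASSED,
    eval_metric_results=[(metric, metric_result)],
    session_id="session-123",                  # session_details defaults to None
)

set_result = EvalSetResult(
    eval_set_result_id=str(uuid.uuid4()),
    eval_set_result_name="weather_agent_run",  # placeholder name
    eval_set_id="weather_agent.test.json",
    eval_case_results=[case_result],
    creation_timestamp=time.time(),
)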
@@ -154,7 +166,7 @@ async def run_evals(
     session_service=None,
     artifact_service=None,
     print_detailed_results=False,
-) -> AsyncGenerator[EvalResult, None]:
+) -> AsyncGenerator[EvalCaseResult, None]:
   try:
     from ..evaluation.agent_evaluator import EvaluationGenerator
     from ..evaluation.response_evaluator import ResponseEvaluator
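
Because run_evals is an async generator of EvalCaseResult, callers drain it with async for (the commit's own _collect_eval_results helper, further down, does exactly this). A hedged sketch; the positional arguments are placeholders, since only the keyword parameters visible in this hunk are known:

# Sketch only -- arguments marked "placeholder" are not taken from the diff.
import asyncio

async def collect_case_results(eval_set_to_evals, root_agent):  # placeholders
  results = []
  async for case_result in run_evals(
      eval_set_to_evals,              # placeholder positional argument
      root_agent,                     # placeholder positional argument
      session_service=None,
      artifact_service=None,
      print_detailed_results=False,
  ):
    results.append(case_result)
  return results

# case_results = asyncio.run(collect_case_results(eval_set_to_evals, root_agent))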
@@ -249,7 +261,7 @@ async def run_evals(
       else:
         raise ValueError("Unknown eval status.")
 
-      yield EvalResult(
+      yield EvalCaseResult(
           eval_set_file=eval_set_file,
           eval_id=eval_name,
           final_eval_status=final_eval_status,
@@ -245,7 +245,7 @@ def cli_eval(
 
   try:
     from .cli_eval import EvalMetric
-    from .cli_eval import EvalResult
+    from .cli_eval import EvalCaseResult
     from .cli_eval import EvalStatus
     from .cli_eval import get_evaluation_criteria_or_default
     from .cli_eval import get_root_agent
@@ -269,7 +269,7 @@ def cli_eval(
 
   eval_set_to_evals = parse_and_get_evals_to_run(eval_set_file_path)
 
-  async def _collect_eval_results() -> list[EvalResult]:
+  async def _collect_eval_results() -> list[EvalCaseResult]:
     return [
         result
         async for result in run_evals(
@@ -290,7 +290,7 @@ def cli_eval(
   eval_run_summary = {}
 
   for eval_result in eval_results:
-    eval_result: EvalResult
+    eval_result: EvalCaseResult
 
     if eval_result.eval_set_file not in eval_run_summary:
       eval_run_summary[eval_result.eval_set_file] = [0, 0]
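
The [0, 0] entries above are per-eval-set pass/fail counters. A hedged sketch of the tallying this hunk feeds into (the actual summary and printing code is outside the visible hunk; EvalStatus.PASSED is an assumption consistent with the enum used above):

# Sketch of the tally; not taken verbatim from the commit.
for eval_result in eval_results:
  counts = eval_run_summary.setdefault(eval_result.eval_set_file, [0, 0])
  if eval_result.final_eval_status == EvalStatus.PASSED:
    counts[0] += 1   # passed
  else:
    counts[1] += 1   # failed or not evaluated

for eval_set_file, (num_passed, num_failed) in eval_run_summary.items():
  print(f"{eval_set_file}: {num_passed} passed, {num_failed} failed")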
@@ -250,7 +250,7 @@ def test_cli_eval_success_path(
     def __init__(self, metric_name: str, threshold: float) -> None:
       ...
 
-  class _EvalResult:
+  class _EvalCaseResult:
 
     def __init__(self, eval_set_file: str, final_eval_status: str) -> None:
       self.eval_set_file = eval_set_file
@@ -261,7 +261,7 @@ def test_cli_eval_success_path(
 
   # helper funcs
   stub.EvalMetric = _EvalMetric
-  stub.EvalResult = _EvalResult
+  stub.EvalCaseResult = _EvalCaseResult
   stub.EvalStatus = _EvalStatus
   stub.MISSING_EVAL_DEPENDENCIES_MESSAGE = "stub msg"
 
@@ -272,8 +272,8 @@ def test_cli_eval_success_path(
 
   # Create an async generator function for run_evals
   async def mock_run_evals(*_a, **_k):
-    yield _EvalResult("set1.json", "PASSED")
-    yield _EvalResult("set1.json", "FAILED")
+    yield _EvalCaseResult("set1.json", "PASSED")
+    yield _EvalCaseResult("set1.json", "FAILED")
 
   stub.run_evals = mock_run_evals
 
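
The stub's mock_run_evals mirrors the real async-generator shape, yielding one passed and one failed _EvalCaseResult. A hedged sketch of draining it directly, e.g. when debugging the test (asyncio.run is an assumption; the test itself exercises the Click CLI command, which is not shown here):

# Sketch only; `stub` is the fake cli_eval module built in the test above.
import asyncio

async def _drain():
  return [result async for result in stub.run_evals()]

results = asyncio.run(_drain())
assert [r.final_eval_status for r in results] == ["PASSED", "FAILED"]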