diff --git a/src/google/adk/cli/cli_eval.py b/src/google/adk/cli/cli_eval.py
index 7534ad2..13e205c 100644
--- a/src/google/adk/cli/cli_eval.py
+++ b/src/google/adk/cli/cli_eval.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import annotations
+
 import importlib.util
 import json
 import logging
@@ -22,98 +24,20 @@
 from typing import AsyncGenerator
 from typing import Optional
 import uuid
 
-from pydantic import Field
-
 from ..agents import Agent
 from ..artifacts.base_artifact_service import BaseArtifactService
 from ..evaluation.eval_case import EvalCase
-from ..evaluation.eval_case import Invocation
+from ..evaluation.eval_metrics import EvalMetric
+from ..evaluation.eval_metrics import EvalMetricResult
+from ..evaluation.eval_metrics import EvalMetricResultPerInvocation
+from ..evaluation.eval_result import EvalCaseResult
 from ..evaluation.evaluator import EvalStatus
 from ..evaluation.evaluator import Evaluator
 from ..sessions.base_session_service import BaseSessionService
-from ..sessions.session import Session
-from .utils import common
 
 logger = logging.getLogger("google_adk." + __name__)
 
 
-class EvalMetric(common.BaseModel):
-  """A metric used to evaluate a particular aspect of an eval case."""
-
-  metric_name: str
-  """The name of the metric."""
-
-  threshold: float
-  """A threshold value. Each metric decides how to interpret this threshold."""
-
-
-class EvalMetricResult(EvalMetric):
-  """The actual computed score/value of a particular EvalMetric."""
-
-  score: Optional[float] = None
-  eval_status: EvalStatus
-
-
-class EvalMetricResultPerInvocation(common.BaseModel):
-  """Eval metric results per invocation."""
-
-  actual_invocation: Invocation
-  """The actual invocation, usually obtained by inferencing the agent."""
-
-  expected_invocation: Invocation
-  """The expected invocation, usually the reference or golden invocation."""
-
-  eval_metric_results: list[EvalMetricResult] = []
-  """Eval resutls for each applicable metric."""
-
-
-class EvalCaseResult(common.BaseModel):
-  """Case-level evaluation results."""
-
-  eval_set_file: str = Field(
-      deprecated=True,
-      description="This field is deprecated, use eval_set_id instead.",
-  )
-  eval_set_id: str = ""
-  """The eval set id."""
-
-  eval_id: str = ""
-  """The eval case id."""
-
-  final_eval_status: EvalStatus
-  """Final eval status for this eval case."""
-
-  eval_metric_results: list[tuple[EvalMetric, EvalMetricResult]] = Field(
-      deprecated=True,
-      description=(
-          "This field is deprecated, use overall_eval_metric_results instead."
-      ),
-  )
-
-  overall_eval_metric_results: list[EvalMetricResult]
-  """Overall result for each metric for the entire eval case."""
-
-  eval_metric_result_per_invocation: list[EvalMetricResultPerInvocation]
-  """Result for each metric on a per invocation basis."""
-
-  session_id: str
-  """Session id of the session generated as result of inferencing/scraping stage of the eval."""
-
-  session_details: Optional[Session] = None
-  """Session generated as result of inferencing/scraping stage of the eval."""
-
-  user_id: Optional[str] = None
-  """User id used during inferencing/scraping stage of the eval."""
-
-
-class EvalSetResult(common.BaseModel):
-  eval_set_result_id: str
-  eval_set_result_name: str
-  eval_set_id: str
-  eval_case_results: list[EvalCaseResult] = Field(default_factory=list)
-  creation_timestamp: float = 0.0
-
-
 MISSING_EVAL_DEPENDENCIES_MESSAGE = (
     "Eval module is not installed, please install via `pip install"
     " google-adk[eval]`."
 )
@@ -227,8 +151,6 @@ async def run_evals(
   """
   try:
     from ..evaluation.agent_evaluator import EvaluationGenerator
-    from ..evaluation.response_evaluator import ResponseEvaluator
-    from ..evaluation.trajectory_evaluator import TrajectoryEvaluator
   except ModuleNotFoundError as e:
     raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
diff --git a/src/google/adk/cli/fast_api.py b/src/google/adk/cli/fast_api.py
index a4a4b61..5090f05 100644
--- a/src/google/adk/cli/fast_api.py
+++ b/src/google/adk/cli/fast_api.py
@@ -13,6 +13,8 @@
 # limitations under the License.
 
+from __future__ import annotations
+
 import asyncio
 from contextlib import asynccontextmanager
 import importlib
@@ -59,6 +61,10 @@
 from ..agents.run_config import StreamingMode
 from ..artifacts.in_memory_artifact_service import InMemoryArtifactService
 from ..evaluation.eval_case import EvalCase
 from ..evaluation.eval_case import SessionInput
+from ..evaluation.eval_metrics import EvalMetric
+from ..evaluation.eval_metrics import EvalMetricResult
+from ..evaluation.eval_metrics import EvalMetricResultPerInvocation
+from ..evaluation.eval_result import EvalSetResult
 from ..evaluation.local_eval_set_results_manager import LocalEvalSetResultsManager
 from ..evaluation.local_eval_sets_manager import LocalEvalSetsManager
 from ..events.event import Event
@@ -69,10 +75,6 @@
 from ..sessions.in_memory_session_service import InMemorySessionService
 from ..sessions.session import Session
 from ..sessions.vertex_ai_session_service import VertexAiSessionService
 from .cli_eval import EVAL_SESSION_ID_PREFIX
-from .cli_eval import EvalMetric
-from .cli_eval import EvalMetricResult
-from .cli_eval import EvalMetricResultPerInvocation
-from .cli_eval import EvalSetResult
 from .cli_eval import EvalStatus
 from .utils import cleanup
 from .utils import common
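Reviewer note: downstream code should now import these models from their canonical home under `google.adk.evaluation` instead of `google.adk.cli.cli_eval`. A minimal usage sketch of the new import path and the camelCase serialization the `ConfigDict` in the new module enables (the metric name and threshold are illustrative values, not taken from this PR):

```python
# Sketch only: imports reflect the new module layout introduced here;
# "tool_trajectory_avg_score" and 1.0 are illustrative values.
from google.adk.evaluation.eval_metrics import EvalMetric

metric = EvalMetric(metric_name="tool_trajectory_avg_score", threshold=1.0)

# The to_camel alias generator means serialized output uses camelCase keys.
print(metric.model_dump(by_alias=True))
# -> {'metricName': 'tool_trajectory_avg_score', 'threshold': 1.0}
```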
diff --git a/src/google/adk/evaluation/eval_metrics.py b/src/google/adk/evaluation/eval_metrics.py
new file mode 100644
index 0000000..505c8bb
--- /dev/null
+++ b/src/google/adk/evaluation/eval_metrics.py
@@ -0,0 +1,64 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from typing import Optional
+
+from pydantic import alias_generators
+from pydantic import BaseModel
+from pydantic import ConfigDict
+
+from .eval_case import Invocation
+from .evaluator import EvalStatus
+
+
+class EvalMetric(BaseModel):
+  """A metric used to evaluate a particular aspect of an eval case."""
+
+  model_config = ConfigDict(
+      alias_generator=alias_generators.to_camel,
+      populate_by_name=True,
+  )
+
+  metric_name: str
+  """The name of the metric."""
+
+  threshold: float
+  """A threshold value. Each metric decides how to interpret this threshold."""
+
+
+class EvalMetricResult(EvalMetric):
+  """The actual computed score/value of a particular EvalMetric."""
+
+  model_config = ConfigDict(
+      alias_generator=alias_generators.to_camel,
+      populate_by_name=True,
+  )
+
+  score: Optional[float] = None
+  eval_status: EvalStatus
+
+
+class EvalMetricResultPerInvocation(BaseModel):
+  """Eval metric results per invocation."""
+
+  actual_invocation: Invocation
+  """The actual invocation, usually obtained by inferencing the agent."""
+
+  expected_invocation: Invocation
+  """The expected invocation, usually the reference or golden invocation."""
+
+  eval_metric_results: list[EvalMetricResult] = []
+  """Eval results for each applicable metric."""
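Because `populate_by_name=True` is paired with the camelCase alias generator, these models accept both snake_case field names and camelCase aliases. A hedged round-trip sketch, assuming `EvalStatus.PASSED` is defined in `evaluator.py` (the metric name and scores are illustrative):

```python
# Sketch: round-trip through camelCase aliases; values are illustrative
# and EvalStatus.PASSED is assumed from google.adk.evaluation.evaluator.
from google.adk.evaluation.eval_metrics import EvalMetricResult
from google.adk.evaluation.evaluator import EvalStatus

result = EvalMetricResult(
    metric_name="response_match_score",  # snake_case field names in...
    threshold=0.8,
    score=0.9,
    eval_status=EvalStatus.PASSED,
)

payload = result.model_dump(by_alias=True)  # ...camelCase keys out...
assert EvalMetricResult.model_validate(payload) == result  # ...and back in
```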
diff --git a/src/google/adk/evaluation/eval_result.py b/src/google/adk/evaluation/eval_result.py
new file mode 100644
index 0000000..8f87a14
--- /dev/null
+++ b/src/google/adk/evaluation/eval_result.py
@@ -0,0 +1,86 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from typing import Optional
+
+from pydantic import alias_generators
+from pydantic import BaseModel
+from pydantic import ConfigDict
+from pydantic import Field
+
+from ..sessions.session import Session
+from .eval_metrics import EvalMetric
+from .eval_metrics import EvalMetricResult
+from .eval_metrics import EvalMetricResultPerInvocation
+from .evaluator import EvalStatus
+
+
+class EvalCaseResult(BaseModel):
+  """Case-level evaluation results."""
+
+  model_config = ConfigDict(
+      alias_generator=alias_generators.to_camel,
+      populate_by_name=True,
+  )
+
+  eval_set_file: str = Field(
+      deprecated=True,
+      description="This field is deprecated, use eval_set_id instead.",
+  )
+  eval_set_id: str = ""
+  """The eval set id."""
+
+  eval_id: str = ""
+  """The eval case id."""
+
+  final_eval_status: EvalStatus
+  """Final eval status for this eval case."""
+
+  eval_metric_results: list[tuple[EvalMetric, EvalMetricResult]] = Field(
+      deprecated=True,
+      description=(
+          "This field is deprecated, use overall_eval_metric_results instead."
+      ),
+  )
+
+  overall_eval_metric_results: list[EvalMetricResult]
+  """Overall result for each metric for the entire eval case."""
+
+  eval_metric_result_per_invocation: list[EvalMetricResultPerInvocation]
+  """Result for each metric on a per invocation basis."""
+
+  session_id: str
+  """Session id of the session generated as result of inferencing/scraping stage of the eval."""
+
+  session_details: Optional[Session] = None
+  """Session generated as result of inferencing/scraping stage of the eval."""
+
+  user_id: Optional[str] = None
+  """User id used during inferencing/scraping stage of the eval."""
+
+
+class EvalSetResult(BaseModel):
+  """Eval set level evaluation results."""
+
+  model_config = ConfigDict(
+      alias_generator=alias_generators.to_camel,
+      populate_by_name=True,
+  )
+  eval_set_result_id: str
+  eval_set_result_name: str
+  eval_set_id: str
+  eval_case_results: list[EvalCaseResult] = Field(default_factory=list)
+  creation_timestamp: float = 0.0
diff --git a/src/google/adk/evaluation/eval_set_results_manager.py b/src/google/adk/evaluation/eval_set_results_manager.py
index cfb6a93..5a300ed 100644
--- a/src/google/adk/evaluation/eval_set_results_manager.py
+++ b/src/google/adk/evaluation/eval_set_results_manager.py
@@ -12,11 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import annotations
+
 from abc import ABC
 from abc import abstractmethod
 
-from ..cli.cli_eval import EvalCaseResult
-from ..cli.cli_eval import EvalSetResult
+from .eval_result import EvalCaseResult
+from .eval_result import EvalSetResult
 
 
 class EvalSetResultsManager(ABC):
diff --git a/src/google/adk/evaluation/local_eval_set_results_manager.py b/src/google/adk/evaluation/local_eval_set_results_manager.py
index f18e984..598af7f 100644
--- a/src/google/adk/evaluation/local_eval_set_results_manager.py
+++ b/src/google/adk/evaluation/local_eval_set_results_manager.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import annotations
+
 import json
 import logging
 import os
@@ -19,8 +21,8 @@
 import time
 
 from typing_extensions import override
 
-from ..cli.cli_eval import EvalCaseResult
-from ..cli.cli_eval import EvalSetResult
+from .eval_result import EvalCaseResult
+from .eval_result import EvalSetResult
 from .eval_set_results_manager import EvalSetResultsManager
 
 logger = logging.getLogger("google_adk." + __name__)
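Finally, a hedged sketch of how the relocated result models compose. Note that the deprecated fields (`eval_set_file`, `eval_metric_results`) have no defaults in this PR and must still be supplied at construction time; all identifiers and values below are illustrative, not taken from a real eval run:

```python
# Sketch: assembling results with the relocated models; the deprecated
# fields are required (no defaults), so they are passed explicitly here.
import time
import uuid

from google.adk.evaluation.eval_result import EvalCaseResult, EvalSetResult
from google.adk.evaluation.evaluator import EvalStatus

case_result = EvalCaseResult(
    eval_set_file="",  # deprecated, use eval_set_id instead
    eval_set_id="my_eval_set",
    eval_id="case_1",
    final_eval_status=EvalStatus.PASSED,
    eval_metric_results=[],  # deprecated, use overall_eval_metric_results
    overall_eval_metric_results=[],
    eval_metric_result_per_invocation=[],
    session_id="session_123",
)

set_result = EvalSetResult(
    eval_set_result_id=str(uuid.uuid4()),
    eval_set_result_name="my_eval_set_run",
    eval_set_id="my_eval_set",
    eval_case_results=[case_result],
    creation_timestamp=time.time(),
)
```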