refactor: refactor evaluation so that the cli module depends on the evaluation module. Modules outside of the cli module should not reference the cli module.

PiperOrigin-RevId: 763577749
This commit is contained in:
Xiang (Sean) Zhou 2025-05-26 19:18:07 -07:00 committed by Copybara-Service
parent d43c80b718
commit 7c2df7e4f6
6 changed files with 178 additions and 92 deletions

View File

@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import importlib.util
import json
import logging
@ -22,98 +24,20 @@ from typing import AsyncGenerator
from typing import Optional
import uuid
from pydantic import Field
from ..agents import Agent
from ..artifacts.base_artifact_service import BaseArtifactService
from ..evaluation.eval_case import EvalCase
from ..evaluation.eval_case import Invocation
from ..evaluation.eval_metrics import EvalMetric
from ..evaluation.eval_metrics import EvalMetricResult
from ..evaluation.eval_metrics import EvalMetricResultPerInvocation
from ..evaluation.eval_result import EvalCaseResult
from ..evaluation.evaluator import EvalStatus
from ..evaluation.evaluator import Evaluator
from ..sessions.base_session_service import BaseSessionService
from ..sessions.session import Session
from .utils import common
logger = logging.getLogger("google_adk." + __name__)
class EvalMetric(common.BaseModel):
  """Describes one metric that an eval case is scored against."""

  metric_name: str
  """Name identifying the metric."""

  threshold: float
  """Threshold for the metric; how it is interpreted is up to each metric."""
class EvalMetricResult(EvalMetric):
  """The score/value actually computed for a particular EvalMetric.

  Extends EvalMetric, so the metric name and threshold travel with the result.
  """

  # Optional: stays None unless a score is supplied.
  score: Optional[float] = None

  # Required: the EvalStatus recorded for this metric.
  eval_status: EvalStatus
class EvalMetricResultPerInvocation(common.BaseModel):
  """Eval metric results per invocation.

  Pairs the invocation that was actually produced with the expected one and
  lists the metric results that apply to that pair.
  """

  actual_invocation: Invocation
  """The actual invocation, usually obtained by inferencing the agent."""

  expected_invocation: Invocation
  """The expected invocation, usually the reference or golden invocation."""

  # NOTE(review): the literal `[]` default relies on pydantic copying mutable
  # field defaults per instance; `Field(default_factory=list)`, as
  # EvalSetResult uses, would make that explicit.
  eval_metric_results: list[EvalMetricResult] = []
  """Eval results for each applicable metric."""
class EvalCaseResult(common.BaseModel):
  """Evaluation results at the level of a single eval case."""

  # Deprecated in favour of `eval_set_id`; declared without a default.
  eval_set_file: str = Field(
      deprecated=True,
      description="This field is deprecated, use eval_set_id instead.",
  )

  eval_set_id: str = ""
  """Id of the eval set."""

  eval_id: str = ""
  """Id of the eval case."""

  final_eval_status: EvalStatus
  """The final eval status of this eval case."""

  # Deprecated in favour of `overall_eval_metric_results`; declared without a
  # default.
  eval_metric_results: list[tuple[EvalMetric, EvalMetricResult]] = Field(
      deprecated=True,
      description="This field is deprecated, use overall_eval_metric_results instead.",
  )

  overall_eval_metric_results: list[EvalMetricResult]
  """One overall result per metric, covering the entire eval case."""

  eval_metric_result_per_invocation: list[EvalMetricResultPerInvocation]
  """Metric results broken down per invocation."""

  session_id: str
  """Id of the session generated by the inferencing/scraping stage of the eval."""

  session_details: Optional[Session] = None
  """The session generated by the inferencing/scraping stage of the eval."""

  user_id: Optional[str] = None
  """The user id used during the inferencing/scraping stage of the eval."""
class EvalSetResult(common.BaseModel):
  """Evaluation results at the level of an eval set."""

  # Identifier and name of this result record.
  eval_set_result_id: str
  eval_set_result_name: str

  # Id of the eval set these results belong to.
  eval_set_id: str

  # One entry per eval case; each instance starts with its own empty list.
  eval_case_results: list[EvalCaseResult] = Field(default_factory=list)

  # Defaults to 0.0 when no timestamp is supplied.
  creation_timestamp: float = 0.0
MISSING_EVAL_DEPENDENCIES_MESSAGE = (
"Eval module is not installed, please install via `pip install"
" google-adk[eval]`."
@ -227,8 +151,6 @@ async def run_evals(
"""
try:
from ..evaluation.agent_evaluator import EvaluationGenerator
from ..evaluation.response_evaluator import ResponseEvaluator
from ..evaluation.trajectory_evaluator import TrajectoryEvaluator
except ModuleNotFoundError as e:
raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e

View File

@ -13,6 +13,8 @@
# limitations under the License.
from __future__ import annotations
import asyncio
from contextlib import asynccontextmanager
import importlib
@ -59,6 +61,10 @@ from ..agents.run_config import StreamingMode
from ..artifacts.in_memory_artifact_service import InMemoryArtifactService
from ..evaluation.eval_case import EvalCase
from ..evaluation.eval_case import SessionInput
from ..evaluation.eval_metrics import EvalMetric
from ..evaluation.eval_metrics import EvalMetricResult
from ..evaluation.eval_metrics import EvalMetricResultPerInvocation
from ..evaluation.eval_result import EvalSetResult
from ..evaluation.local_eval_set_results_manager import LocalEvalSetResultsManager
from ..evaluation.local_eval_sets_manager import LocalEvalSetsManager
from ..events.event import Event
@ -69,10 +75,6 @@ from ..sessions.in_memory_session_service import InMemorySessionService
from ..sessions.session import Session
from ..sessions.vertex_ai_session_service import VertexAiSessionService
from .cli_eval import EVAL_SESSION_ID_PREFIX
from .cli_eval import EvalMetric
from .cli_eval import EvalMetricResult
from .cli_eval import EvalMetricResultPerInvocation
from .cli_eval import EvalSetResult
from .cli_eval import EvalStatus
from .utils import cleanup
from .utils import common

View File

@ -0,0 +1,72 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
from typing import Optional
from pydantic import alias_generators
from pydantic import BaseModel
from pydantic import ConfigDict
from .eval_case import Invocation
from .evaluator import EvalStatus
class EvalMetric(BaseModel):
  """A metric used to evaluate a particular aspect of an eval case."""

  # Declared once: the original assigned this identical config twice, and the
  # second assignment merely overwrote the first.
  # `to_camel` gives every field a camelCase alias, while `populate_by_name`
  # keeps the snake_case field names accepted as input.
  model_config = ConfigDict(
      alias_generator=alias_generators.to_camel,
      populate_by_name=True,
  )

  metric_name: str
  """The name of the metric."""

  threshold: float
  """A threshold value. Each metric decides how to interpret this threshold."""
class EvalMetricResult(EvalMetric):
  """The actual computed score/value of a particular EvalMetric.

  Subclasses EvalMetric, so `metric_name` and `threshold` are part of every
  result.
  """

  # Declared once: the original repeated this identical assignment twice in a
  # row, and the second simply replaced the first.
  model_config = ConfigDict(
      alias_generator=alias_generators.to_camel,
      populate_by_name=True,
  )

  # Optional: stays None unless a score is supplied.
  score: Optional[float] = None

  # Required: the EvalStatus recorded for this metric.
  eval_status: EvalStatus
class EvalMetricResultPerInvocation(BaseModel):
  """Eval metric results per invocation.

  Pairs the invocation that was actually produced with the expected one and
  lists the metric results that apply to that pair.
  """

  # Same alias configuration as EvalMetric and EvalMetricResult in this module.
  # Without it, dumping a containing model by alias yields camelCase keys
  # everywhere except on this model's fields. `populate_by_name` keeps the
  # snake_case field names valid as input.
  model_config = ConfigDict(
      alias_generator=alias_generators.to_camel,
      populate_by_name=True,
  )

  actual_invocation: Invocation
  """The actual invocation, usually obtained by inferencing the agent."""

  expected_invocation: Invocation
  """The expected invocation, usually the reference or golden invocation."""

  # NOTE(review): the literal `[]` default relies on pydantic copying mutable
  # field defaults per instance — confirm against the pydantic version in use.
  eval_metric_results: list[EvalMetricResult] = []
  """Eval results for each applicable metric."""

View File

@ -0,0 +1,86 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
from typing import Optional
from pydantic import alias_generators
from pydantic import BaseModel
from pydantic import ConfigDict
from pydantic import Field
from ..sessions.session import Session
from .eval_metrics import EvalMetric
from .eval_metrics import EvalMetricResult
from .eval_metrics import EvalMetricResultPerInvocation
from .evaluator import EvalStatus
class EvalCaseResult(BaseModel):
  """Evaluation results at the level of a single eval case."""

  # camelCase aliases for every field; snake_case names stay valid as input.
  model_config = ConfigDict(
      alias_generator=alias_generators.to_camel,
      populate_by_name=True,
  )

  # Deprecated in favour of `eval_set_id`; declared without a default.
  eval_set_file: str = Field(
      deprecated=True,
      description="This field is deprecated, use eval_set_id instead.",
  )

  eval_set_id: str = ""
  """Id of the eval set."""

  eval_id: str = ""
  """Id of the eval case."""

  final_eval_status: EvalStatus
  """The final eval status of this eval case."""

  # Deprecated in favour of `overall_eval_metric_results`; declared without a
  # default.
  eval_metric_results: list[tuple[EvalMetric, EvalMetricResult]] = Field(
      deprecated=True,
      description="This field is deprecated, use overall_eval_metric_results instead.",
  )

  overall_eval_metric_results: list[EvalMetricResult]
  """One overall result per metric, covering the entire eval case."""

  eval_metric_result_per_invocation: list[EvalMetricResultPerInvocation]
  """Metric results broken down per invocation."""

  session_id: str
  """Id of the session generated by the inferencing/scraping stage of the eval."""

  session_details: Optional[Session] = None
  """The session generated by the inferencing/scraping stage of the eval."""

  user_id: Optional[str] = None
  """The user id used during the inferencing/scraping stage of the eval."""
class EvalSetResult(BaseModel):
  """Evaluation results at the level of an eval set."""

  # camelCase aliases for every field; snake_case names stay valid as input.
  model_config = ConfigDict(
      alias_generator=alias_generators.to_camel,
      populate_by_name=True,
  )

  # Identifier and name of this result record.
  eval_set_result_id: str
  eval_set_result_name: str

  # Id of the eval set these results belong to.
  eval_set_id: str

  # One entry per eval case; each instance starts with its own empty list.
  eval_case_results: list[EvalCaseResult] = Field(default_factory=list)

  # Defaults to 0.0 when no timestamp is supplied.
  creation_timestamp: float = 0.0

View File

@ -12,11 +12,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
from abc import ABC
from abc import abstractmethod
from ..cli.cli_eval import EvalCaseResult
from ..cli.cli_eval import EvalSetResult
from .eval_result import EvalCaseResult
from .eval_result import EvalSetResult
class EvalSetResultsManager(ABC):

View File

@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import json
import logging
import os
@ -19,8 +21,8 @@ import time
from typing_extensions import override
from ..cli.cli_eval import EvalCaseResult
from ..cli.cli_eval import EvalSetResult
from .eval_result import EvalCaseResult
from .eval_result import EvalSetResult
from .eval_set_results_manager import EvalSetResultsManager
logger = logging.getLogger("google_adk." + __name__)