Mirror of https://github.com/EvolutionAPI/adk-python.git (synced 2025-07-16 04:02:55 -06:00)
refactor: refactor evaluation so that the cli module depends on the evaluation module. Modules outside the cli module should not reference the cli module.

PiperOrigin-RevId: 763577749

commit 7c2df7e4f6 (parent d43c80b718)
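
The dependency direction after this change, as an illustrative sketch (the import lines are taken from the hunks below; file paths shown as comments):

  # src/google/adk/cli/cli_eval.py -- cli importing from evaluation: allowed
  from ..evaluation.eval_metrics import EvalMetric

  # src/google/adk/evaluation/eval_set_results_manager.py -- evaluation
  # reaching back into cli is no longer allowed, so this import
  #   from ..cli.cli_eval import EvalCaseResult
  # is replaced with one that stays inside the evaluation package:
  from .eval_result import EvalCaseResult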
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import annotations
+
 import importlib.util
 import json
 import logging
@@ -22,98 +24,20 @@ from typing import AsyncGenerator
 from typing import Optional
 import uuid
 
-from pydantic import Field
-
 from ..agents import Agent
 from ..artifacts.base_artifact_service import BaseArtifactService
 from ..evaluation.eval_case import EvalCase
-from ..evaluation.eval_case import Invocation
+from ..evaluation.eval_metrics import EvalMetric
+from ..evaluation.eval_metrics import EvalMetricResult
+from ..evaluation.eval_metrics import EvalMetricResultPerInvocation
+from ..evaluation.eval_result import EvalCaseResult
 from ..evaluation.evaluator import EvalStatus
 from ..evaluation.evaluator import Evaluator
 from ..sessions.base_session_service import BaseSessionService
-from ..sessions.session import Session
-from .utils import common
 
 logger = logging.getLogger("google_adk." + __name__)
 
-
-class EvalMetric(common.BaseModel):
-  """A metric used to evaluate a particular aspect of an eval case."""
-
-  metric_name: str
-  """The name of the metric."""
-
-  threshold: float
-  """A threshold value. Each metric decides how to interpret this threshold."""
-
-
-class EvalMetricResult(EvalMetric):
-  """The actual computed score/value of a particular EvalMetric."""
-
-  score: Optional[float] = None
-  eval_status: EvalStatus
-
-
-class EvalMetricResultPerInvocation(common.BaseModel):
-  """Eval metric results per invocation."""
-
-  actual_invocation: Invocation
-  """The actual invocation, usually obtained by inferencing the agent."""
-
-  expected_invocation: Invocation
-  """The expected invocation, usually the reference or golden invocation."""
-
-  eval_metric_results: list[EvalMetricResult] = []
-  """Eval results for each applicable metric."""
-
-
-class EvalCaseResult(common.BaseModel):
-  """Case-level evaluation results."""
-
-  eval_set_file: str = Field(
-      deprecated=True,
-      description="This field is deprecated, use eval_set_id instead.",
-  )
-  eval_set_id: str = ""
-  """The eval set id."""
-
-  eval_id: str = ""
-  """The eval case id."""
-
-  final_eval_status: EvalStatus
-  """Final eval status for this eval case."""
-
-  eval_metric_results: list[tuple[EvalMetric, EvalMetricResult]] = Field(
-      deprecated=True,
-      description=(
-          "This field is deprecated, use overall_eval_metric_results instead."
-      ),
-  )
-
-  overall_eval_metric_results: list[EvalMetricResult]
-  """Overall result for each metric for the entire eval case."""
-
-  eval_metric_result_per_invocation: list[EvalMetricResultPerInvocation]
-  """Result for each metric on a per invocation basis."""
-
-  session_id: str
-  """Session id of the session generated as result of inferencing/scraping stage of the eval."""
-
-  session_details: Optional[Session] = None
-  """Session generated as result of inferencing/scraping stage of the eval."""
-
-  user_id: Optional[str] = None
-  """User id used during inferencing/scraping stage of the eval."""
-
-
-class EvalSetResult(common.BaseModel):
-  eval_set_result_id: str
-  eval_set_result_name: str
-  eval_set_id: str
-  eval_case_results: list[EvalCaseResult] = Field(default_factory=list)
-  creation_timestamp: float = 0.0
-
-
 MISSING_EVAL_DEPENDENCIES_MESSAGE = (
     "Eval module is not installed, please install via `pip install"
     " google-adk[eval]`."
@@ -227,8 +151,6 @@ async def run_evals(
   """
   try:
     from ..evaluation.agent_evaluator import EvaluationGenerator
-    from ..evaluation.response_evaluator import ResponseEvaluator
-    from ..evaluation.trajectory_evaluator import TrajectoryEvaluator
   except ModuleNotFoundError as e:
     raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
 
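The try/except above is the optional-dependency guard: eval-only imports are deferred into the function body so that importing the cli module never requires the eval extra, and a missing module is converted into an actionable install hint. A minimal standalone sketch of the same pattern (the imported module name below is a placeholder, not an actual google-adk dependency):

  MISSING_EVAL_DEPENDENCIES_MESSAGE = (
      "Eval module is not installed, please install via `pip install"
      " google-adk[eval]`."
  )

  def _require_eval_deps():
    # Deferred import: only evaluated when evals actually run, so the rest
    # of the cli keeps working without the optional extra installed.
    try:
      import some_eval_only_dependency  # placeholder module name
    except ModuleNotFoundError as e:
      raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
    return some_eval_only_dependency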
@@ -13,6 +13,8 @@
 # limitations under the License.
 
+from __future__ import annotations
+
 import asyncio
 from contextlib import asynccontextmanager
 import importlib
@@ -59,6 +61,10 @@ from ..agents.run_config import StreamingMode
 from ..artifacts.in_memory_artifact_service import InMemoryArtifactService
 from ..evaluation.eval_case import EvalCase
 from ..evaluation.eval_case import SessionInput
+from ..evaluation.eval_metrics import EvalMetric
+from ..evaluation.eval_metrics import EvalMetricResult
+from ..evaluation.eval_metrics import EvalMetricResultPerInvocation
+from ..evaluation.eval_result import EvalSetResult
 from ..evaluation.local_eval_set_results_manager import LocalEvalSetResultsManager
 from ..evaluation.local_eval_sets_manager import LocalEvalSetsManager
 from ..events.event import Event
@@ -69,10 +75,6 @@ from ..sessions.in_memory_session_service import InMemorySessionService
 from ..sessions.session import Session
 from ..sessions.vertex_ai_session_service import VertexAiSessionService
 from .cli_eval import EVAL_SESSION_ID_PREFIX
-from .cli_eval import EvalMetric
-from .cli_eval import EvalMetricResult
-from .cli_eval import EvalMetricResultPerInvocation
-from .cli_eval import EvalSetResult
 from .cli_eval import EvalStatus
 from .utils import cleanup
 from .utils import common
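With this change the web server pulls the shared eval models from the evaluation package instead of from cli_eval. For downstream code using an installed distribution, the equivalent absolute imports would presumably be (package root inferred from the src/google/adk/... paths in this commit):

  from google.adk.evaluation.eval_metrics import EvalMetric
  from google.adk.evaluation.eval_metrics import EvalMetricResult
  from google.adk.evaluation.eval_metrics import EvalMetricResultPerInvocation
  from google.adk.evaluation.eval_result import EvalCaseResult
  from google.adk.evaluation.eval_result import EvalSetResult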
src/google/adk/evaluation/eval_metrics.py (new file)
@@ -0,0 +1,72 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from typing import Optional
+
+from pydantic import alias_generators
+from pydantic import BaseModel
+from pydantic import ConfigDict
+
+from .eval_case import Invocation
+from .evaluator import EvalStatus
+
+
+class EvalMetric(BaseModel):
+  """A metric used to evaluate a particular aspect of an eval case."""
+
+  model_config = ConfigDict(
+      alias_generator=alias_generators.to_camel,
+      populate_by_name=True,
+  )
+
+  metric_name: str
+  """The name of the metric."""
+
+  threshold: float
+  """A threshold value. Each metric decides how to interpret this threshold."""
+
+
+class EvalMetricResult(EvalMetric):
+  """The actual computed score/value of a particular EvalMetric."""
+
+  model_config = ConfigDict(
+      alias_generator=alias_generators.to_camel,
+      populate_by_name=True,
+  )
+
+  score: Optional[float] = None
+  eval_status: EvalStatus
+
+
+class EvalMetricResultPerInvocation(BaseModel):
+  """Eval metric results per invocation."""
+
+  actual_invocation: Invocation
+  """The actual invocation, usually obtained by inferencing the agent."""
+
+  expected_invocation: Invocation
+  """The expected invocation, usually the reference or golden invocation."""
+
+  eval_metric_results: list[EvalMetricResult] = []
+  """Eval results for each applicable metric."""
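The ConfigDict with alias_generators.to_camel plus populate_by_name=True means these models read and write camelCase JSON while keeping snake_case attribute names in Python. A quick sketch (metric name and threshold are made-up values):

  m = EvalMetric(metric_name="response_match_score", threshold=0.8)
  m.model_dump(by_alias=True)
  # -> {'metricName': 'response_match_score', 'threshold': 0.8}

  # Both spellings are accepted on input: the camelCase alias...
  EvalMetric.model_validate({"metricName": "response_match_score", "threshold": 0.8})
  # ...and, thanks to populate_by_name=True, the snake_case field name.
  EvalMetric.model_validate({"metric_name": "response_match_score", "threshold": 0.8})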
src/google/adk/evaluation/eval_result.py (new file)
@@ -0,0 +1,86 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from typing import Optional
+
+from pydantic import alias_generators
+from pydantic import BaseModel
+from pydantic import ConfigDict
+from pydantic import Field
+
+from ..sessions.session import Session
+from .eval_metrics import EvalMetric
+from .eval_metrics import EvalMetricResult
+from .eval_metrics import EvalMetricResultPerInvocation
+from .evaluator import EvalStatus
+
+
+class EvalCaseResult(BaseModel):
+  """Case-level evaluation results."""
+
+  model_config = ConfigDict(
+      alias_generator=alias_generators.to_camel,
+      populate_by_name=True,
+  )
+
+  eval_set_file: str = Field(
+      deprecated=True,
+      description="This field is deprecated, use eval_set_id instead.",
+  )
+  eval_set_id: str = ""
+  """The eval set id."""
+
+  eval_id: str = ""
+  """The eval case id."""
+
+  final_eval_status: EvalStatus
+  """Final eval status for this eval case."""
+
+  eval_metric_results: list[tuple[EvalMetric, EvalMetricResult]] = Field(
+      deprecated=True,
+      description=(
+          "This field is deprecated, use overall_eval_metric_results instead."
+      ),
+  )
+
+  overall_eval_metric_results: list[EvalMetricResult]
+  """Overall result for each metric for the entire eval case."""
+
+  eval_metric_result_per_invocation: list[EvalMetricResultPerInvocation]
+  """Result for each metric on a per invocation basis."""
+
+  session_id: str
+  """Session id of the session generated as result of inferencing/scraping stage of the eval."""
+
+  session_details: Optional[Session] = None
+  """Session generated as result of inferencing/scraping stage of the eval."""
+
+  user_id: Optional[str] = None
+  """User id used during inferencing/scraping stage of the eval."""
+
+
+class EvalSetResult(BaseModel):
+  """Eval set level evaluation results."""
+
+  model_config = ConfigDict(
+      alias_generator=alias_generators.to_camel,
+      populate_by_name=True,
+  )
+  eval_set_result_id: str
+  eval_set_result_name: str
+  eval_set_id: str
+  eval_case_results: list[EvalCaseResult] = Field(default_factory=list)
+  creation_timestamp: float = 0.0
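A sketch of assembling results with these models (ids and scores are made up, and EvalStatus is assumed to expose a PASSED member; note that the deprecated eval_set_file and eval_metric_results fields have no defaults, so they still have to be supplied):

  metric_result = EvalMetricResult(
      metric_name="response_match_score",
      threshold=0.8,
      score=0.91,
      eval_status=EvalStatus.PASSED,
  )

  case_result = EvalCaseResult(
      eval_set_file="",  # deprecated; use eval_set_id instead
      eval_set_id="demo_eval_set",
      eval_id="case_1",
      final_eval_status=EvalStatus.PASSED,
      eval_metric_results=[],  # deprecated; use overall_eval_metric_results
      overall_eval_metric_results=[metric_result],
      eval_metric_result_per_invocation=[],
      session_id="session_123",
  )

  eval_set_result = EvalSetResult(
      eval_set_result_id="demo_result_id",
      eval_set_result_name="demo_result",
      eval_set_id="demo_eval_set",
      eval_case_results=[case_result],
  )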
@@ -12,11 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import annotations
+
 from abc import ABC
 from abc import abstractmethod
 
-from ..cli.cli_eval import EvalCaseResult
-from ..cli.cli_eval import EvalSetResult
+from .eval_result import EvalCaseResult
+from .eval_result import EvalSetResult
 
 
 class EvalSetResultsManager(ABC):
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import annotations
+
 import json
 import logging
 import os
@@ -19,8 +21,8 @@ import time
 
 from typing_extensions import override
 
-from ..cli.cli_eval import EvalCaseResult
-from ..cli.cli_eval import EvalSetResult
+from .eval_result import EvalCaseResult
+from .eval_result import EvalSetResult
 from .eval_set_results_manager import EvalSetResultsManager
 
 logger = logging.getLogger("google_adk." + __name__)