Mirror of https://github.com/EvolutionAPI/adk-python.git (synced 2025-07-14 01:41:25 -06:00)
Update AgentEvaluator to new EvalSchema
PiperOrigin-RevId: 759293759
This commit is contained in: parent bdd678db31, commit 4c6820e78c
@@ -17,10 +17,8 @@ import json
 import logging
 import os
 import sys
-import traceback
 from typing import Any
 from typing import AsyncGenerator
-from typing import cast
 from typing import Optional
 import uuid
 
@@ -350,7 +348,7 @@ def _get_evaluator(eval_metric: EvalMetric) -> Evaluator:
     return TrajectoryEvaluator(threshold=eval_metric.threshold)
   elif (
       eval_metric.metric_name == RESPONSE_MATCH_SCORE_KEY
-      or eval_metric == RESPONSE_EVALUATION_SCORE_KEY
+      or eval_metric.metric_name == RESPONSE_EVALUATION_SCORE_KEY
   ):
     return ResponseEvaluator(
         threshold=eval_metric.threshold, metric_name=eval_metric.metric_name
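The one-line change above fixes a comparison that could never be true: the old code compared the EvalMetric object itself to a string key, so the response-evaluation metric never selected ResponseEvaluator. A minimal sketch of the difference, assuming EvalMetric is a pydantic model with metric_name and threshold fields (the real model lives elsewhere in the package) and using an illustrative value for the constant:

from pydantic import BaseModel

RESPONSE_EVALUATION_SCORE_KEY = "response_evaluation_score"  # illustrative value


class EvalMetric(BaseModel):  # stand-in for the real model
  metric_name: str
  threshold: float


metric = EvalMetric(metric_name=RESPONSE_EVALUATION_SCORE_KEY, threshold=0.8)

# Old check: a model instance is never equal to a string, so this branch was
# unreachable for the response-evaluation metric.
print(metric == RESPONSE_EVALUATION_SCORE_KEY)              # False
# New check: compare the metric's name instead.
print(metric.metric_name == RESPONSE_EVALUATION_SCORE_KEY)  # True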
@@ -18,8 +18,13 @@ from os import path
 from typing import Dict
 from typing import List
 from typing import Union
+import uuid
 
+from .eval_set import EvalSet
 from .evaluation_generator import EvaluationGenerator
+from .evaluator import EvalStatus
+from .evaluator import EvaluationResult
+from .evaluator import Evaluator
+from .local_eval_sets_manager import convert_eval_set_to_pydanctic_schema
 from .response_evaluator import ResponseEvaluator
 from .trajectory_evaluator import TrajectoryEvaluator
 
@@ -75,6 +80,62 @@ class AgentEvaluator:
         )
     return DEFAULT_CRITERIA
 
+  @staticmethod
+  async def evaluate_eval_set(
+      agent_module: str,
+      eval_set: EvalSet,
+      criteria: dict[str, float],
+      num_runs=NUM_RUNS,
+      agent_name=None,
+  ):
+    """Evaluates an agent using the given EvalSet.
+
+    Args:
+      agent_module: The path to python module that contains the definition of
+        the agent. There is a convention in place here, where the code is going
+        to look for 'root_agent' in the loaded module.
+      eval_set: The eval set.
+      criteria: Evaluation criteria, a dictionary of metric names to their
+        respective thresholds.
+      num_runs: Number of times all entries in the eval dataset should be
+        assessed.
+      agent_name: The name of the agent.
+    """
+    eval_case_responses_list = await EvaluationGenerator.generate_responses(
+        eval_set=eval_set,
+        agent_module_path=agent_module,
+        repeat_num=num_runs,
+        agent_name=agent_name,
+    )
+
+    for eval_case_responses in eval_case_responses_list:
+      actual_invocations = [
+          invocation
+          for invocations in eval_case_responses.responses
+          for invocation in invocations
+      ]
+      expected_invocations = (
+          eval_case_responses.eval_case.conversation * num_runs
+      )
+
+      for metric_name, threshold in criteria.items():
+        metric_evaluator = AgentEvaluator._get_metric_evaluator(
+            metric_name=metric_name, threshold=threshold
+        )
+
+        evaluation_result: EvaluationResult = (
+            metric_evaluator.evaluate_invocations(
+                actual_invocations=actual_invocations,
+                expected_invocations=expected_invocations,
+            )
+        )
+
+        assert evaluation_result.overall_eval_status == EvalStatus.PASSED, (
+            f"`{eval_case_responses.eval_case.eval_id}`: "
+            f"{metric_name} for {agent_module} Failed. Expected {threshold},"
+            f" but got {evaluation_result.overall_score}."
+        )
+
   @staticmethod
   async def evaluate(
       agent_module,
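For reference, a hedged sketch of calling the new entry point directly. It mirrors what evaluate() does in the next hunk: wrap a legacy test file in the converter's JSON shape, convert it to the pydantic EvalSet schema, then hand it to evaluate_eval_set. The converter's import path, the criteria key value, and the use of asyncio.run are assumptions; the agent module and test-file path come from the integration tests further down.

import asyncio
import json
import uuid

from google.adk.evaluation import AgentEvaluator
# Assumed public import path for the converter used inside evaluate() below.
from google.adk.evaluation.local_eval_sets_manager import (
    convert_eval_set_to_pydanctic_schema,
)


async def run_eval_set_example():
  test_file = (
      "tests/integration/fixture/home_automation_agent/simple_test.test.json"
  )
  with open(test_file, "r") as f:
    data = json.loads(f.read())  # assumed to parse to the legacy invocation list

  eval_data = {
      "name": test_file,
      "data": data,
      "initial_session": {},  # or the parsed initial-session file, if any
  }
  eval_set = convert_eval_set_to_pydanctic_schema(
      eval_set_id=str(uuid.uuid4()), eval_set_in_json_format=[eval_data]
  )
  await AgentEvaluator.evaluate_eval_set(
      agent_module="tests.integration.fixture.home_automation_agent",
      eval_set=eval_set,
      criteria={"tool_trajectory_avg_score": 1.0},  # assumed key value
      num_runs=2,
  )


asyncio.run(run_eval_set_example())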
@@ -109,34 +170,32 @@ class AgentEvaluator:
     else:
       test_files = [eval_dataset_file_path_or_dir]
 
-    initial_session_state = {}
+    initial_session = {}
     if initial_session_file:
       with open(initial_session_file, "r") as f:
-        initial_session_state = json.loads(f.read())["state"]
+        initial_session = json.loads(f.read())
 
     for test_file in test_files:
-      dataset = AgentEvaluator._load_dataset(test_file)[0]
+      data = AgentEvaluator._load_dataset(test_file)[0]
       criteria = AgentEvaluator.find_config_for_test_file(test_file)
+      AgentEvaluator._validate_input([data], criteria)
 
-      AgentEvaluator._validate_input([dataset], criteria)
+      eval_data = {
+          "name": test_file,
+          "data": data,
+          "initial_session": initial_session,
+      }
 
-      evaluation_response = await AgentEvaluator._generate_responses(
-          agent_module,
-          [dataset],
-          num_runs,
-          agent_name=agent_name,
-          initial_session={"state": initial_session_state},
+      eval_set = convert_eval_set_to_pydanctic_schema(
+          eval_set_id=str(uuid.uuid4()), eval_set_in_json_format=[eval_data]
+      )
+      await AgentEvaluator.evaluate_eval_set(
+          agent_module=agent_module,
+          eval_set=eval_set,
+          criteria=criteria,
+          num_runs=num_runs,
+          agent_name=agent_name,
       )
 
-      if AgentEvaluator._response_evaluation_required(criteria, [dataset]):
-        AgentEvaluator._evaluate_response_scores(
-            agent_module, evaluation_response, criteria
-        )
-
-      if AgentEvaluator._trajectory_evaluation_required(criteria, [dataset]):
-        AgentEvaluator._evaluate_tool_trajectory(
-            agent_module, evaluation_response, criteria
-        )
-
   @staticmethod
   def _load_dataset(
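Note the behavioural shift in the hunk above: the initial-session file is no longer reduced to its "state" entry; the whole object is kept and threaded through the converter, which turns it into a SessionInput (see the local_eval_sets_manager hunk near the end of this commit). A small sketch with illustrative values:

import json

# Illustrative initial-session file content; field names follow SessionInput.
initial_session = json.loads("""
{
  "app_name": "home_automation_app",
  "user_id": "test_user",
  "state": {"favorite_room": "living room"}
}
""")

# Previously only initial_session["state"] survived; now the full object is
# placed into the converter's JSON shape.
eval_data = {
    "name": "simple_test.test.json",
    "data": [],                          # legacy invocation list goes here
    "initial_session": initial_session,  # whole object, not just its "state"
}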
@@ -221,102 +280,13 @@ class AgentEvaluator:
       )
 
   @staticmethod
-  def _get_infer_criteria(eval_dataset):
-    """Infers evaluation criteria based on the provided dataset.
-
-    Args:
-      eval_dataset (list): A list of evaluation samples.
-
-    Returns:
-      dict: Inferred evaluation criteria based on dataset fields.
-    """
-    inferred_criteria = {}
-    sample = eval_dataset[0][0]
-
-    if QUERY_COLUMN in sample and EXPECTED_TOOL_USE_COLUMN in sample:
-      inferred_criteria[TOOL_TRAJECTORY_SCORE_KEY] = DEFAULT_CRITERIA[
-          TOOL_TRAJECTORY_SCORE_KEY
-      ]
-
-    if QUERY_COLUMN in sample and REFERENCE_COLUMN in sample:
-      inferred_criteria[RESPONSE_MATCH_SCORE_KEY] = DEFAULT_CRITERIA[
-          RESPONSE_MATCH_SCORE_KEY
-      ]
-
-    return inferred_criteria
-
-  @staticmethod
-  async def _generate_responses(
-      agent_module, eval_dataset, num_runs, agent_name=None, initial_session={}
-  ):
-    """Generates evaluation responses by running the agent module multiple times."""
-    return EvaluationGenerator.generate_responses(
-        eval_dataset,
-        agent_module,
-        repeat_num=num_runs,
-        agent_name=agent_name,
-        initial_session=initial_session,
-    )
-
-  @staticmethod
-  def _response_evaluation_required(criteria, eval_dataset):
-    """Checks if response evaluation are needed."""
-    return REFERENCE_COLUMN in eval_dataset[0][0] and any(
-        key in criteria
-        for key in [RESPONSE_EVALUATION_SCORE_KEY, RESPONSE_MATCH_SCORE_KEY]
-    )
-
-  @staticmethod
-  def _trajectory_evaluation_required(evaluation_criteria, eval_dataset):
-    """Checks if response evaluation are needed."""
-    return (
-        EXPECTED_TOOL_USE_COLUMN in eval_dataset[0][0]
-        and TOOL_TRAJECTORY_SCORE_KEY in evaluation_criteria
-    )
-
-  @staticmethod
-  def _evaluate_response_scores(agent_module, evaluation_response, criteria):
-    """Evaluates response scores and raises an assertion error if they don't meet the criteria."""
-    metrics = ResponseEvaluator.evaluate(
-        evaluation_response, criteria, print_detailed_results=True
-    )
-
-    AgentEvaluator._assert_score(
-        metrics,
-        "coherence/mean",
-        criteria.get(RESPONSE_EVALUATION_SCORE_KEY),
-        "Average response evaluation score",
-        agent_module,
-    )
-
-    AgentEvaluator._assert_score(
-        metrics,
-        "rouge_1/mean",
-        criteria.get(RESPONSE_MATCH_SCORE_KEY),
-        "Average response match score",
-        agent_module,
-    )
-
-  @staticmethod
-  def _evaluate_tool_trajectory(agent_module, evaluation_response, criteria):
-    """Evaluates tool trajectory scores and raises an assertion error if they don't meet the criteria."""
-    score = TrajectoryEvaluator.evaluate(
-        evaluation_response, print_detailed_results=True
-    )
-    AgentEvaluator._assert_score(
-        {TOOL_TRAJECTORY_SCORE_KEY: score},
-        TOOL_TRAJECTORY_SCORE_KEY,
-        criteria[TOOL_TRAJECTORY_SCORE_KEY],
-        "Average tool trajectory evaluation score",
-        agent_module,
-    )
-
-  @staticmethod
-  def _assert_score(metrics, metric_key, threshold, description, agent_module):
-    """Asserts that a metric meets the specified threshold."""
-    if metric_key in metrics:
-      actual_score = metrics[metric_key]
-      assert actual_score >= threshold, (
-          f"{description} for {agent_module} is lower than expected. "
-          f"Expected >= {threshold}, but got {actual_score}."
-      )
+  def _get_metric_evaluator(metric_name: str, threshold: float) -> Evaluator:
+    if metric_name == TOOL_TRAJECTORY_SCORE_KEY:
+      return TrajectoryEvaluator(threshold=threshold)
+    elif (
+        metric_name == RESPONSE_MATCH_SCORE_KEY
+        or metric_name == RESPONSE_EVALUATION_SCORE_KEY
+    ):
+      return ResponseEvaluator(threshold=threshold, metric_name=metric_name)
+
+    raise ValueError(f"Unsupported eval metric: {metric_name}")
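The hunk above collapses the old column-sniffing and score-assertion helpers into a single per-metric dispatch. A hedged sketch of how that dispatch behaves, using assumed values for the metric-key constants (only the dispatch logic is taken from the diff, and the private helper is shown purely for illustration):

from google.adk.evaluation import AgentEvaluator

criteria = {
    "tool_trajectory_avg_score": 1.0,  # assumed value of TOOL_TRAJECTORY_SCORE_KEY
    "response_match_score": 0.8,       # assumed value of RESPONSE_MATCH_SCORE_KEY
}

for metric_name, threshold in criteria.items():
  evaluator = AgentEvaluator._get_metric_evaluator(
      metric_name=metric_name, threshold=threshold
  )
  # Each evaluator exposes evaluate_invocations(actual_invocations=...,
  # expected_invocations=...) and returns an EvaluationResult whose
  # overall_eval_status is compared against EvalStatus.PASSED.

# Anything outside the three known keys now fails loudly:
AgentEvaluator._get_metric_evaluator("unknown_metric", 0.5)  # raises ValueError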
@@ -13,9 +13,12 @@
 # limitations under the License.
 
 import importlib
-from typing import Any, Optional
+from typing import Any
+from typing import Optional
 import uuid
 
+from pydantic import BaseModel
+
 from ..agents.llm_agent import Agent
 from ..artifacts.base_artifact_service import BaseArtifactService
 from ..artifacts.in_memory_artifact_service import InMemoryArtifactService
@@ -23,9 +26,21 @@ from ..runners import Runner
 from ..sessions.base_session_service import BaseSessionService
 from ..sessions.in_memory_session_service import InMemorySessionService
 from ..sessions.session import Session
+from .eval_case import EvalCase
 from .eval_case import IntermediateData
 from .eval_case import Invocation
 from .eval_case import SessionInput
+from .eval_set import EvalSet
 
 
+class EvalCaseResponses(BaseModel):
+  """Contains multiple responses associated with an EvalCase.
+
+  Multiple responses are a result of repeated requests to generate inferences.
+  """
+
+  eval_case: EvalCase
+  responses: list[list[Invocation]]
+
+
 class EvaluationGenerator:
@@ -33,12 +48,11 @@ class EvaluationGenerator:
 
   @staticmethod
   async def generate_responses(
-      eval_dataset,
-      agent_module_path,
-      repeat_num=3,
-      agent_name=None,
-      initial_session={},
-  ):
+      eval_set: EvalSet,
+      agent_module_path: str,
+      repeat_num: int = 3,
+      agent_name: str = None,
+  ) -> list[EvalCaseResponses]:
     """Returns evaluation responses for the given dataset and agent.
 
     Args:
@@ -48,17 +62,23 @@ class EvaluationGenerator:
         usually done to remove uncertainty that a single run may bring.
       agent_name: The name of the agent that should be evaluated. This is
         usually the sub-agent.
-      initial_session: Initial session for the eval data.
     """
     results = []
 
-    for _ in range(repeat_num):
-      for data in eval_dataset:
-        results.append(
-            EvaluationGenerator._process_query(
-                data, agent_module_path, agent_name, initial_session
-            )
-        )
+    for eval_case in eval_set.eval_cases:
+      responses = []
+      for _ in range(repeat_num):
+        response_invocations = await EvaluationGenerator._process_query(
+            eval_case.conversation,
+            agent_module_path,
+            agent_name,
+            eval_case.session_input,
+        )
+        responses.append(response_invocations)
+
+      results.append(
+          EvalCaseResponses(eval_case=eval_case, responses=responses)
+      )
 
     return results
 
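A hedged sketch of calling the reworked generator directly (normally it is only reached through AgentEvaluator.evaluate_eval_set). The import path is an assumption; eval_set is an EvalSet built as in the earlier sketch, and the module path is the integration-test fixture:

from google.adk.evaluation.evaluation_generator import EvaluationGenerator


async def show_generated_responses(eval_set):
  # eval_set: an EvalSet built as in the earlier sketch.
  eval_case_responses_list = await EvaluationGenerator.generate_responses(
      eval_set=eval_set,
      agent_module_path="tests.integration.fixture.home_automation_agent",
      repeat_num=2,
  )
  for eval_case_responses in eval_case_responses_list:
    # responses holds one inner list of Invocations per run of the eval case.
    print(
        eval_case_responses.eval_case.eval_id,
        len(eval_case_responses.responses),
    )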
@@ -89,7 +109,12 @@ class EvaluationGenerator:
     return results
 
   @staticmethod
-  def _process_query(data, module_name, agent_name=None, initial_session={}):
+  async def _process_query(
+      invocations: list[Invocation],
+      module_name: str,
+      agent_name: Optional[str] = None,
+      initial_session: Optional[SessionInput] = None,
+  ) -> list[Invocation]:
     """Process a query using the agent and evaluation dataset."""
     module_path = f"{module_name}"
     agent_module = importlib.import_module(module_path)
@@ -102,8 +127,8 @@ class EvaluationGenerator:
       agent_to_evaluate = root_agent.find_agent(agent_name)
       assert agent_to_evaluate, f"Sub-Agent `{agent_name}` not found."
 
-    return EvaluationGenerator._generate_inferences_from_root_agent(
-        data, agent_to_evaluate, reset_func, initial_session
+    return await EvaluationGenerator._generate_inferences_from_root_agent(
+        invocations, agent_to_evaluate, reset_func, initial_session
     )
 
   @staticmethod
@@ -216,3 +241,5 @@ class EvaluationGenerator:
       responses[index]["actual_tool_use"] = actual_tool_uses
       responses[index]["response"] = response
     return responses
+    return responses
+    return responses
@@ -43,16 +43,16 @@ def _convert_invocation_to_pydantic_schema(
   expected_tool_use = []
   expected_intermediate_agent_responses = []
 
-  for old_tool_use in invocation_in_json_format["expected_tool_use"]:
+  for old_tool_use in invocation_in_json_format.get("expected_tool_use", []):
     expected_tool_use.append(
         genai_types.FunctionCall(
             name=old_tool_use["tool_name"], args=old_tool_use["tool_input"]
         )
     )
 
-  for old_intermediate_response in invocation_in_json_format[
-      "expected_intermediate_agent_responses"
-  ]:
+  for old_intermediate_response in invocation_in_json_format.get(
+      "expected_intermediate_agent_responses", []
+  ):
     expected_intermediate_agent_responses.append((
         old_intermediate_response["author"],
         [genai_types.Part.from_text(text=old_intermediate_response["text"])],
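With the .get() defaults above, both expected_tool_use and expected_intermediate_agent_responses become optional in the legacy JSON. A hedged sketch of entries that now convert cleanly; the "query" and "reference" keys and all values are illustrative assumptions, while the nested key names come from the converter itself:

# Converts without KeyError even though no tool use or intermediate responses
# are recorded for this turn.
minimal_invocation = {
    "query": "Turn off the lights in the living room.",  # assumed key
    "reference": "I have turned off the lights.",         # assumed key
}

richer_invocation = {
    "query": "Set the thermostat to 21 degrees.",
    "expected_tool_use": [
        {"tool_name": "set_temperature", "tool_input": {"celsius": 21}}
    ],
    "expected_intermediate_agent_responses": [
        {"author": "planner_agent", "text": "Adjusting the thermostat now."}
    ],
    "reference": "The thermostat is now set to 21 degrees.",
}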
@@ -134,14 +134,18 @@ convert_eval_set_to_pydanctic_schema(
           _convert_invocation_to_pydantic_schema(old_invocation)
       )
 
+    session_input = None
+    if "initial_session" in old_eval_case:
+      session_input = SessionInput(
+          app_name=old_eval_case["initial_session"].get("app_name", ""),
+          user_id=old_eval_case["initial_session"].get("user_id", ""),
+          state=old_eval_case["initial_session"].get("state", {}),
+      )
+
     new_eval_case = EvalCase(
         eval_id=old_eval_case["name"],
         conversation=new_invocations,
-        session_input=SessionInput(
-            app_name=old_eval_case["initial_session"]["app_name"],
-            user_id=old_eval_case["initial_session"]["user_id"],
-            state=old_eval_case["initial_session"]["state"],
-        ),
+        session_input=session_input,
         creation_timestamp=time.time(),
     )
     eval_cases.append(new_eval_case)
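Similarly, initial_session becomes optional for a legacy eval case: when the key is missing the EvalCase is built with session_input=None, and when it is present each field falls back to a default. A small sketch with illustrative values, assuming the converter reads the invocation list from the "data" key as the eval_data dict built in agent_evaluator suggests:

# Converts to an EvalCase with session_input=None.
case_without_session = {
    "name": "turn_off_lights",
    "data": [],  # invocation entries as in the previous sketch
}

# Converts to an EvalCase whose session_input has app_name="", the given
# user_id, and the given state, thanks to the .get(...) defaults above.
case_with_partial_session = {
    "name": "thermostat_with_state",
    "data": [],
    "initial_session": {"user_id": "test_user", "state": {"home": "apartment"}},
}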
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 from google.adk.evaluation import AgentEvaluator
+import pytest
 
 
 @pytest.mark.asyncio
|
@ -13,6 +13,7 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
from google.adk.evaluation import AgentEvaluator
|
from google.adk.evaluation import AgentEvaluator
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
|
@ -13,11 +13,12 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
from google.adk.evaluation import AgentEvaluator
|
from google.adk.evaluation import AgentEvaluator
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_eval_agent():
|
async def test_eval_agent():
|
||||||
AgentEvaluator.evaluate(
|
await AgentEvaluator.evaluate(
|
||||||
agent_module="tests.integration.fixture.home_automation_agent",
|
agent_module="tests.integration.fixture.home_automation_agent",
|
||||||
eval_dataset_file_path_or_dir="tests/integration/fixture/home_automation_agent/simple_test.test.json",
|
eval_dataset_file_path_or_dir="tests/integration/fixture/home_automation_agent/simple_test.test.json",
|
||||||
num_runs=4,
|
num_runs=4,
|
||||||
|
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 from google.adk.evaluation import AgentEvaluator
+import pytest
 
 
 @pytest.mark.asyncio
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 from google.adk.evaluation import AgentEvaluator
+import pytest
 
 
 @pytest.mark.asyncio