mirror of
https://github.com/EvolutionAPI/adk-python.git
synced 2025-07-14 09:51:25 -06:00
Update Eval Set Manager to use new EvalSet and EvalCase schema.
PiperOrigin-RevId: 758921365
This commit is contained in:
parent
293f406148
commit
2cb74dd20e
@ -60,6 +60,8 @@ from ..agents.llm_agent import Agent
|
|||||||
from ..agents.llm_agent import LlmAgent
|
from ..agents.llm_agent import LlmAgent
|
||||||
from ..agents.run_config import StreamingMode
|
from ..agents.run_config import StreamingMode
|
||||||
from ..artifacts import InMemoryArtifactService
|
from ..artifacts import InMemoryArtifactService
|
||||||
|
from ..evaluation.eval_case import EvalCase
|
||||||
|
from ..evaluation.eval_case import SessionInput
|
||||||
from ..evaluation.local_eval_sets_manager import LocalEvalSetsManager
|
from ..evaluation.local_eval_sets_manager import LocalEvalSetsManager
|
||||||
from ..events.event import Event
|
from ..events.event import Event
|
||||||
from ..memory.in_memory_memory_service import InMemoryMemoryService
|
from ..memory.in_memory_memory_service import InMemoryMemoryService
|
||||||
@ -436,25 +438,25 @@ def get_fast_api_app(
|
|||||||
)
|
)
|
||||||
assert session, "Session not found."
|
assert session, "Session not found."
|
||||||
|
|
||||||
# Convert the session data to evaluation format
|
# Convert the session data to eval invocations
|
||||||
test_data = evals.convert_session_to_eval_format(session)
|
invocations = evals.convert_session_to_eval_invocations(session)
|
||||||
|
|
||||||
# Populate the session with initial session state.
|
# Populate the session with initial session state.
|
||||||
initial_session_state = create_empty_state(
|
initial_session_state = create_empty_state(
|
||||||
await _get_root_agent_async(app_name)
|
await _get_root_agent_async(app_name)
|
||||||
)
|
)
|
||||||
|
|
||||||
eval_case = {
|
new_eval_case = EvalCase(
|
||||||
"name": req.eval_id,
|
eval_id=req.eval_id,
|
||||||
"data": test_data,
|
conversation=invocations,
|
||||||
"initial_session": {
|
session_input=SessionInput(
|
||||||
"state": initial_session_state,
|
app_name=app_name, user_id=req.user_id, state=initial_session_state
|
||||||
"app_name": app_name,
|
),
|
||||||
"user_id": req.user_id,
|
creation_timestamp=time.time(),
|
||||||
},
|
)
|
||||||
}
|
|
||||||
try:
|
try:
|
||||||
eval_sets_manager.add_eval_case(app_name, eval_set_id, eval_case)
|
eval_sets_manager.add_eval_case(app_name, eval_set_id, new_eval_case)
|
||||||
except ValueError as ve:
|
except ValueError as ve:
|
||||||
raise HTTPException(status_code=400, detail=str(ve)) from ve
|
raise HTTPException(status_code=400, detail=str(ve)) from ve
|
||||||
|
|
||||||
@ -469,7 +471,7 @@ def get_fast_api_app(
|
|||||||
"""Lists all evals in an eval set."""
|
"""Lists all evals in an eval set."""
|
||||||
eval_set_data = eval_sets_manager.get_eval_set(app_name, eval_set_id)
|
eval_set_data = eval_sets_manager.get_eval_set(app_name, eval_set_id)
|
||||||
|
|
||||||
return sorted([x["name"] for x in eval_set_data])
|
return sorted([x.eval_id for x in eval_set_data.eval_cases])
|
||||||
|
|
||||||
@app.post(
|
@app.post(
|
||||||
"/apps/{app_name}/eval_sets/{eval_set_id}/run_eval",
|
"/apps/{app_name}/eval_sets/{eval_set_id}/run_eval",
|
||||||
|
@ -47,10 +47,10 @@ class Invocation(BaseModel):
|
|||||||
user_content: genai_types.Content
|
user_content: genai_types.Content
|
||||||
"""Content provided by the user in this invocation."""
|
"""Content provided by the user in this invocation."""
|
||||||
|
|
||||||
final_response: Optional[genai_types.Content]
|
final_response: Optional[genai_types.Content] = None
|
||||||
"""Final response from the agent that acts as a reference or benchmark."""
|
"""Final response from the agent that acts as a reference or benchmark."""
|
||||||
|
|
||||||
intermediate_data: IntermediateData
|
intermediate_data: Optional[IntermediateData] = None
|
||||||
"""Reference intermediate steps generated as a part of Agent execution.
|
"""Reference intermediate steps generated as a part of Agent execution.
|
||||||
|
|
||||||
For a multi-agent system, it is also helpful to inspect the route that
|
For a multi-agent system, it is also helpful to inspect the route that
|
||||||
@ -83,7 +83,7 @@ class EvalCase(BaseModel):
|
|||||||
conversation: list[Invocation]
|
conversation: list[Invocation]
|
||||||
"""A conversation between the user and the Agent. The conversation can have any number of invocations."""
|
"""A conversation between the user and the Agent. The conversation can have any number of invocations."""
|
||||||
|
|
||||||
session_input: SessionInput
|
session_input: Optional[SessionInput] = None
|
||||||
"""Session input that will be passed on to the Agent during eval.
|
"""Session input that will be passed on to the Agent during eval.
|
||||||
It is common for an Agent's state to be initialized to some initial/default value,
|
It is common for an Agent's state to be initialized to some initial/default value,
|
||||||
for example, your agent may need to know today's date.
|
for example, your agent may need to know today's date.
|
||||||
|
@ -23,10 +23,10 @@ class EvalSet(BaseModel):
|
|||||||
eval_set_id: str
|
eval_set_id: str
|
||||||
"""Unique identifier for the eval set."""
|
"""Unique identifier for the eval set."""
|
||||||
|
|
||||||
name: Optional[str]
|
name: Optional[str] = None
|
||||||
"""Name of the dataset."""
|
"""Name of the dataset."""
|
||||||
|
|
||||||
description: Optional[str]
|
description: Optional[str] = None
|
||||||
"""Description of the dataset."""
|
"""Description of the dataset."""
|
||||||
|
|
||||||
eval_cases: list[EvalCase]
|
eval_cases: list[EvalCase]
|
||||||
|
@ -13,14 +13,16 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from typing import Any
|
|
||||||
|
from .eval_case import EvalCase
|
||||||
|
from .eval_set import EvalSet
|
||||||
|
|
||||||
|
|
||||||
class EvalSetsManager(ABC):
|
class EvalSetsManager(ABC):
|
||||||
"""An interface to manage Eval Sets."""
|
"""An interface to manage Eval Sets."""
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def get_eval_set(self, app_name: str, eval_set_id: str) -> Any:
|
def get_eval_set(self, app_name: str, eval_set_id: str) -> EvalSet:
|
||||||
"""Returns an EvalSet identified by an app_name and eval_set_id."""
|
"""Returns an EvalSet identified by an app_name and eval_set_id."""
|
||||||
raise NotImplementedError()
|
raise NotImplementedError()
|
||||||
|
|
||||||
@ -35,6 +37,6 @@ class EvalSetsManager(ABC):
|
|||||||
raise NotImplementedError()
|
raise NotImplementedError()
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def add_eval_case(self, app_name: str, eval_set_id: str, eval_case: Any):
|
def add_eval_case(self, app_name: str, eval_set_id: str, eval_case: EvalCase):
|
||||||
"""Adds the given EvalCase to an existing EvalSet identified by app_name and eval_set_id."""
|
"""Adds the given EvalCase to an existing EvalSet identified by app_name and eval_set_id."""
|
||||||
raise NotImplementedError()
|
raise NotImplementedError()
|
||||||
|
@ -16,8 +16,17 @@ import json
|
|||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
|
import time
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
import uuid
|
||||||
|
from google.genai import types as genai_types
|
||||||
|
from pydantic import ValidationError
|
||||||
from typing_extensions import override
|
from typing_extensions import override
|
||||||
|
from .eval_case import EvalCase
|
||||||
|
from .eval_case import IntermediateData
|
||||||
|
from .eval_case import Invocation
|
||||||
|
from .eval_case import SessionInput
|
||||||
|
from .eval_set import EvalSet
|
||||||
from .eval_sets_manager import EvalSetsManager
|
from .eval_sets_manager import EvalSetsManager
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
@ -25,6 +34,126 @@ logger = logging.getLogger(__name__)
|
|||||||
_EVAL_SET_FILE_EXTENSION = ".evalset.json"
|
_EVAL_SET_FILE_EXTENSION = ".evalset.json"
|
||||||
|
|
||||||
|
|
||||||
|
def _convert_invocation_to_pydantic_schema(
|
||||||
|
invocation_in_json_format: dict[str, Any],
|
||||||
|
) -> Invocation:
|
||||||
|
"""Converts an invocation from old json format to new Pydantic Schema"""
|
||||||
|
query = invocation_in_json_format["query"]
|
||||||
|
reference = invocation_in_json_format["reference"]
|
||||||
|
expected_tool_use = []
|
||||||
|
expected_intermediate_agent_responses = []
|
||||||
|
|
||||||
|
for old_tool_use in invocation_in_json_format["expected_tool_use"]:
|
||||||
|
expected_tool_use.append(
|
||||||
|
genai_types.FunctionCall(
|
||||||
|
name=old_tool_use["tool_name"], args=old_tool_use["tool_input"]
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
for old_intermediate_response in invocation_in_json_format[
|
||||||
|
"expected_intermediate_agent_responses"
|
||||||
|
]:
|
||||||
|
expected_intermediate_agent_responses.append((
|
||||||
|
old_intermediate_response["author"],
|
||||||
|
[genai_types.Part.from_text(text=old_intermediate_response["text"])],
|
||||||
|
))
|
||||||
|
|
||||||
|
return Invocation(
|
||||||
|
invocation_id=str(uuid.uuid4()),
|
||||||
|
user_content=genai_types.Content(
|
||||||
|
parts=[genai_types.Part.from_text(text=query)], role="user"
|
||||||
|
),
|
||||||
|
final_response=genai_types.Content(
|
||||||
|
parts=[genai_types.Part.from_text(text=reference)], role="model"
|
||||||
|
),
|
||||||
|
intermediate_data=IntermediateData(
|
||||||
|
tool_uses=expected_tool_use,
|
||||||
|
intermediate_responses=expected_intermediate_agent_responses,
|
||||||
|
),
|
||||||
|
creation_timestamp=time.time(),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def convert_eval_set_to_pydanctic_schema(
|
||||||
|
eval_set_id: str,
|
||||||
|
eval_set_in_json_format: list[dict[str, Any]],
|
||||||
|
) -> EvalSet:
|
||||||
|
r"""Returns a pydantic EvalSet generated from the json representation.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
eval_set_id: Eval set id.
|
||||||
|
eval_set_in_json_format: Eval set specified in JSON format.
|
||||||
|
|
||||||
|
Here is a sample eval set in JSON format:
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"name": "roll_17_sided_dice_twice",
|
||||||
|
"data": [
|
||||||
|
{
|
||||||
|
"query": "What can you do?",
|
||||||
|
"expected_tool_use": [],
|
||||||
|
"expected_intermediate_agent_responses": [],
|
||||||
|
"reference": "I can roll dice of different sizes and check if a number
|
||||||
|
is prime. I can also use multiple tools in parallel.\n"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"query": "Roll a 17 sided dice twice for me",
|
||||||
|
"expected_tool_use": [
|
||||||
|
{
|
||||||
|
"tool_name": "roll_die",
|
||||||
|
"tool_input": {
|
||||||
|
"sides": 17
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"tool_name": "roll_die",
|
||||||
|
"tool_input": {
|
||||||
|
"sides": 17
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"expected_intermediate_agent_responses": [],
|
||||||
|
"reference": "I have rolled a 17 sided die twice. The first roll was
|
||||||
|
13 and the second roll was 4.\n"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"initial_session": {
|
||||||
|
"state": {},
|
||||||
|
"app_name": "hello_world",
|
||||||
|
"user_id": "user"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
"""
|
||||||
|
eval_cases = []
|
||||||
|
for old_eval_case in eval_set_in_json_format:
|
||||||
|
new_invocations = []
|
||||||
|
|
||||||
|
for old_invocation in old_eval_case["data"]:
|
||||||
|
new_invocations.append(
|
||||||
|
_convert_invocation_to_pydantic_schema(old_invocation)
|
||||||
|
)
|
||||||
|
|
||||||
|
new_eval_case = EvalCase(
|
||||||
|
eval_id=old_eval_case["name"],
|
||||||
|
conversation=new_invocations,
|
||||||
|
session_input=SessionInput(
|
||||||
|
app_name=old_eval_case["initial_session"]["app_name"],
|
||||||
|
user_id=old_eval_case["initial_session"]["user_id"],
|
||||||
|
state=old_eval_case["initial_session"]["state"],
|
||||||
|
),
|
||||||
|
creation_timestamp=time.time(),
|
||||||
|
)
|
||||||
|
eval_cases.append(new_eval_case)
|
||||||
|
|
||||||
|
return EvalSet(
|
||||||
|
eval_set_id=eval_set_id,
|
||||||
|
name=eval_set_id,
|
||||||
|
creation_timestamp=time.time(),
|
||||||
|
eval_cases=eval_cases,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class LocalEvalSetsManager(EvalSetsManager):
|
class LocalEvalSetsManager(EvalSetsManager):
|
||||||
"""An EvalSets manager that stores eval sets locally on disk."""
|
"""An EvalSets manager that stores eval sets locally on disk."""
|
||||||
|
|
||||||
@ -32,12 +161,20 @@ class LocalEvalSetsManager(EvalSetsManager):
|
|||||||
self._agent_dir = agent_dir
|
self._agent_dir = agent_dir
|
||||||
|
|
||||||
@override
|
@override
|
||||||
def get_eval_set(self, app_name: str, eval_set_id: str) -> Any:
|
def get_eval_set(self, app_name: str, eval_set_id: str) -> EvalSet:
|
||||||
"""Returns an EvalSet identified by an app_name and eval_set_id."""
|
"""Returns an EvalSet identified by an app_name and eval_set_id."""
|
||||||
# Load the eval set file data
|
# Load the eval set file data
|
||||||
eval_set_file_path = self._get_eval_set_file_path(app_name, eval_set_id)
|
eval_set_file_path = self._get_eval_set_file_path(app_name, eval_set_id)
|
||||||
with open(eval_set_file_path, "r") as file:
|
with open(eval_set_file_path, "r", encoding="utf-8") as f:
|
||||||
return json.load(file) # Load JSON into a list
|
content = f.read()
|
||||||
|
try:
|
||||||
|
return EvalSet.model_validate_json(content)
|
||||||
|
except ValidationError:
|
||||||
|
# We assume that the eval data was specified in the old format and try
|
||||||
|
# to convert it to the new format.
|
||||||
|
return convert_eval_set_to_pydanctic_schema(
|
||||||
|
eval_set_id, json.loads(content)
|
||||||
|
)
|
||||||
|
|
||||||
@override
|
@override
|
||||||
def create_eval_set(self, app_name: str, eval_set_id: str):
|
def create_eval_set(self, app_name: str, eval_set_id: str):
|
||||||
@ -52,9 +189,13 @@ class LocalEvalSetsManager(EvalSetsManager):
|
|||||||
if not os.path.exists(new_eval_set_path):
|
if not os.path.exists(new_eval_set_path):
|
||||||
# Write the JSON string to the file
|
# Write the JSON string to the file
|
||||||
logger.info("Eval set file doesn't exist, we will create a new one.")
|
logger.info("Eval set file doesn't exist, we will create a new one.")
|
||||||
with open(new_eval_set_path, "w") as f:
|
new_eval_set = EvalSet(
|
||||||
empty_content = json.dumps([], indent=2)
|
eval_set_id=eval_set_id,
|
||||||
f.write(empty_content)
|
name=eval_set_id,
|
||||||
|
eval_cases=[],
|
||||||
|
creation_timestamp=time.time(),
|
||||||
|
)
|
||||||
|
self._write_eval_set(new_eval_set_path, new_eval_set)
|
||||||
|
|
||||||
@override
|
@override
|
||||||
def list_eval_sets(self, app_name: str) -> list[str]:
|
def list_eval_sets(self, app_name: str) -> list[str]:
|
||||||
@ -70,26 +211,23 @@ class LocalEvalSetsManager(EvalSetsManager):
|
|||||||
return sorted(eval_sets)
|
return sorted(eval_sets)
|
||||||
|
|
||||||
@override
|
@override
|
||||||
def add_eval_case(self, app_name: str, eval_set_id: str, eval_case: Any):
|
def add_eval_case(self, app_name: str, eval_set_id: str, eval_case: EvalCase):
|
||||||
"""Adds the given EvalCase to an existing EvalSet identified by app_name and eval_set_id."""
|
"""Adds the given EvalCase to an existing EvalSet identified by app_name and eval_set_id."""
|
||||||
eval_case_id = eval_case["name"]
|
eval_case_id = eval_case.eval_id
|
||||||
self._validate_id(id_name="Eval Case Id", id_value=eval_case_id)
|
self._validate_id(id_name="Eval Case Id", id_value=eval_case_id)
|
||||||
|
|
||||||
# Load the eval set file data
|
eval_set = self.get_eval_set(app_name, eval_set_id)
|
||||||
eval_set_file_path = self._get_eval_set_file_path(app_name, eval_set_id)
|
|
||||||
with open(eval_set_file_path, "r") as file:
|
|
||||||
eval_set_data = json.load(file) # Load JSON into a list
|
|
||||||
|
|
||||||
if [x for x in eval_set_data if x["name"] == eval_case_id]:
|
if [x for x in eval_set.eval_cases if x.eval_id == eval_case_id]:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"Eval id `{eval_case_id}` already exists in `{eval_set_id}`"
|
f"Eval id `{eval_case_id}` already exists in `{eval_set_id}`"
|
||||||
" eval set.",
|
" eval set.",
|
||||||
)
|
)
|
||||||
|
|
||||||
eval_set_data.append(eval_case)
|
eval_set.eval_cases.append(eval_case)
|
||||||
# Serialize the test data to JSON and write to the eval set file.
|
|
||||||
with open(eval_set_file_path, "w") as f:
|
eval_set_file_path = self._get_eval_set_file_path(app_name, eval_set_id)
|
||||||
f.write(json.dumps(eval_set_data, indent=2))
|
self._write_eval_set(eval_set_file_path, eval_set)
|
||||||
|
|
||||||
def _get_eval_set_file_path(self, app_name: str, eval_set_id: str) -> str:
|
def _get_eval_set_file_path(self, app_name: str, eval_set_id: str) -> str:
|
||||||
return os.path.join(
|
return os.path.join(
|
||||||
@ -104,3 +242,7 @@ class LocalEvalSetsManager(EvalSetsManager):
|
|||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"Invalid {id_name}. {id_name} should have the `{pattern}` format",
|
f"Invalid {id_name}. {id_name} should have the `{pattern}` format",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def _write_eval_set(self, eval_set_path: str, eval_set: EvalSet):
|
||||||
|
with open(eval_set_path, "w") as f:
|
||||||
|
f.write(eval_set.model_dump_json(indent=2))
|
||||||
|
Loading…
Reference in New Issue
Block a user