Update Eval Set Manager to use new EvalSet and EvalCase schema.

PiperOrigin-RevId: 758921365
This commit is contained in:
Ankur Sharma 2025-05-14 18:51:03 -07:00 committed by Copybara-Service
parent 293f406148
commit 2cb74dd20e
5 changed files with 184 additions and 38 deletions

View File

@ -60,6 +60,8 @@ from ..agents.llm_agent import Agent
from ..agents.llm_agent import LlmAgent from ..agents.llm_agent import LlmAgent
from ..agents.run_config import StreamingMode from ..agents.run_config import StreamingMode
from ..artifacts import InMemoryArtifactService from ..artifacts import InMemoryArtifactService
from ..evaluation.eval_case import EvalCase
from ..evaluation.eval_case import SessionInput
from ..evaluation.local_eval_sets_manager import LocalEvalSetsManager from ..evaluation.local_eval_sets_manager import LocalEvalSetsManager
from ..events.event import Event from ..events.event import Event
from ..memory.in_memory_memory_service import InMemoryMemoryService from ..memory.in_memory_memory_service import InMemoryMemoryService
@ -436,25 +438,25 @@ def get_fast_api_app(
) )
assert session, "Session not found." assert session, "Session not found."
# Convert the session data to evaluation format # Convert the session data to eval invocations
test_data = evals.convert_session_to_eval_format(session) invocations = evals.convert_session_to_eval_invocations(session)
# Populate the session with initial session state. # Populate the session with initial session state.
initial_session_state = create_empty_state( initial_session_state = create_empty_state(
await _get_root_agent_async(app_name) await _get_root_agent_async(app_name)
) )
eval_case = { new_eval_case = EvalCase(
"name": req.eval_id, eval_id=req.eval_id,
"data": test_data, conversation=invocations,
"initial_session": { session_input=SessionInput(
"state": initial_session_state, app_name=app_name, user_id=req.user_id, state=initial_session_state
"app_name": app_name, ),
"user_id": req.user_id, creation_timestamp=time.time(),
}, )
}
try: try:
eval_sets_manager.add_eval_case(app_name, eval_set_id, eval_case) eval_sets_manager.add_eval_case(app_name, eval_set_id, new_eval_case)
except ValueError as ve: except ValueError as ve:
raise HTTPException(status_code=400, detail=str(ve)) from ve raise HTTPException(status_code=400, detail=str(ve)) from ve
@ -469,7 +471,7 @@ def get_fast_api_app(
"""Lists all evals in an eval set.""" """Lists all evals in an eval set."""
eval_set_data = eval_sets_manager.get_eval_set(app_name, eval_set_id) eval_set_data = eval_sets_manager.get_eval_set(app_name, eval_set_id)
return sorted([x["name"] for x in eval_set_data]) return sorted([x.eval_id for x in eval_set_data.eval_cases])
@app.post( @app.post(
"/apps/{app_name}/eval_sets/{eval_set_id}/run_eval", "/apps/{app_name}/eval_sets/{eval_set_id}/run_eval",

View File

@ -47,10 +47,10 @@ class Invocation(BaseModel):
user_content: genai_types.Content user_content: genai_types.Content
"""Content provided by the user in this invocation.""" """Content provided by the user in this invocation."""
final_response: Optional[genai_types.Content] final_response: Optional[genai_types.Content] = None
"""Final response from the agent that acts a reference or benchmark.""" """Final response from the agent that acts a reference or benchmark."""
intermediate_data: IntermediateData intermediate_data: Optional[IntermediateData] = None
"""Reference intermediate steps generated as a part of Agent execution. """Reference intermediate steps generated as a part of Agent execution.
For a multi-agent system, it is also helpful to inspect the route that For a multi-agent system, it is also helpful to inspect the route that
@ -83,7 +83,7 @@ class EvalCase(BaseModel):
conversation: list[Invocation] conversation: list[Invocation]
"""A conversation between the user and the Agent. The conversation can have any number of invocations.""" """A conversation between the user and the Agent. The conversation can have any number of invocations."""
session_input: SessionInput session_input: Optional[SessionInput] = None
"""Session input that will be passed on to the Agent during eval. """Session input that will be passed on to the Agent during eval.
It is common for Agents state to be initialized to some initial/default value, It is common for Agents state to be initialized to some initial/default value,
for example, your agent may need to know today's date. for example, your agent may need to know today's date.

View File

@ -23,10 +23,10 @@ class EvalSet(BaseModel):
eval_set_id: str eval_set_id: str
"""Unique identifier for the eval set.""" """Unique identifier for the eval set."""
name: Optional[str] name: Optional[str] = None
"""Name of the dataset.""" """Name of the dataset."""
description: Optional[str] description: Optional[str] = None
"""Description of the dataset.""" """Description of the dataset."""
eval_cases: list[EvalCase] eval_cases: list[EvalCase]

View File

@ -13,14 +13,16 @@
# limitations under the License. # limitations under the License.
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from typing import Any
from .eval_case import EvalCase
from .eval_set import EvalSet
class EvalSetsManager(ABC): class EvalSetsManager(ABC):
"""An interface to manage an Eval Sets.""" """An interface to manage an Eval Sets."""
@abstractmethod @abstractmethod
def get_eval_set(self, app_name: str, eval_set_id: str) -> Any: def get_eval_set(self, app_name: str, eval_set_id: str) -> EvalSet:
"""Returns an EvalSet identified by an app_name and eval_set_id.""" """Returns an EvalSet identified by an app_name and eval_set_id."""
raise NotImplementedError() raise NotImplementedError()
@ -35,6 +37,6 @@ class EvalSetsManager(ABC):
raise NotImplementedError() raise NotImplementedError()
@abstractmethod @abstractmethod
def add_eval_case(self, app_name: str, eval_set_id: str, eval_case: Any): def add_eval_case(self, app_name: str, eval_set_id: str, eval_case: EvalCase):
"""Adds the given EvalCase to an existing EvalSet identified by app_name and eval_set_id.""" """Adds the given EvalCase to an existing EvalSet identified by app_name and eval_set_id."""
raise NotImplementedError() raise NotImplementedError()

View File

@ -16,8 +16,17 @@ import json
import logging import logging
import os import os
import re import re
import time
from typing import Any from typing import Any
import uuid
from google.genai import types as genai_types
from pydantic import ValidationError
from typing_extensions import override from typing_extensions import override
from .eval_case import EvalCase
from .eval_case import IntermediateData
from .eval_case import Invocation
from .eval_case import SessionInput
from .eval_set import EvalSet
from .eval_sets_manager import EvalSetsManager from .eval_sets_manager import EvalSetsManager
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -25,6 +34,126 @@ logger = logging.getLogger(__name__)
_EVAL_SET_FILE_EXTENSION = ".evalset.json" _EVAL_SET_FILE_EXTENSION = ".evalset.json"
def _convert_invocation_to_pydantic_schema(
    invocation_in_json_format: dict[str, Any],
) -> Invocation:
  """Converts an invocation from the old JSON format to the Pydantic schema.

  Args:
    invocation_in_json_format: One entry of an old-format eval case's "data"
      list. "query" is required. "reference" defaults to an empty string when
      absent; "expected_tool_use" and "expected_intermediate_agent_responses"
      default to empty lists when absent or null.

  Returns:
    An Invocation with a freshly generated UUID as invocation id and the
    current time as creation timestamp.

  Raises:
    KeyError: If "query" is missing, or if a tool-use entry lacks "tool_name"
      / "tool_input", or an intermediate response lacks "author" / "text".
  """
  query = invocation_in_json_format["query"]
  # Hand-written old-format files frequently omit the optional fields, so
  # default them instead of failing the whole legacy conversion on a KeyError.
  reference = invocation_in_json_format.get("reference", "")
  old_tool_uses = invocation_in_json_format.get("expected_tool_use") or []
  old_intermediate_responses = (
      invocation_in_json_format.get("expected_intermediate_agent_responses")
      or []
  )

  expected_tool_use = [
      genai_types.FunctionCall(
          name=old_tool_use["tool_name"], args=old_tool_use["tool_input"]
      )
      for old_tool_use in old_tool_uses
  ]

  # Each intermediate response is stored as an (author, parts) pair.
  expected_intermediate_agent_responses = [
      (
          old_intermediate_response["author"],
          [genai_types.Part.from_text(text=old_intermediate_response["text"])],
      )
      for old_intermediate_response in old_intermediate_responses
  ]

  return Invocation(
      invocation_id=str(uuid.uuid4()),
      user_content=genai_types.Content(
          parts=[genai_types.Part.from_text(text=query)], role="user"
      ),
      final_response=genai_types.Content(
          parts=[genai_types.Part.from_text(text=reference)], role="model"
      ),
      intermediate_data=IntermediateData(
          tool_uses=expected_tool_use,
          intermediate_responses=expected_intermediate_agent_responses,
      ),
      creation_timestamp=time.time(),
  )
def convert_eval_set_to_pydanctic_schema(
    eval_set_id: str,
    eval_set_in_json_format: list[dict[str, Any]],
) -> EvalSet:
  r"""Returns a pydantic EvalSet generated from the old JSON representation.

  Each old-format eval case becomes an EvalCase whose id is the case's "name"
  and whose conversation is built from the case's "data" list. A missing or
  empty "initial_session" yields an EvalCase without session input; within a
  present "initial_session", "state" defaults to an empty dict.

  Args:
    eval_set_id: Eval set id. It is also used as the name of the returned
      EvalSet.
    eval_set_in_json_format: Eval set specified in JSON format.

  Returns:
    An EvalSet holding the converted eval cases, stamped with the current time.

  Raises:
    KeyError: If an eval case lacks "name" or "data", or a non-empty
      "initial_session" lacks "app_name" or "user_id".

  Here is a sample eval set in JSON format:
  [
    {
      "name": "roll_17_sided_dice_twice",
      "data": [
        {
          "query": "What can you do?",
          "expected_tool_use": [],
          "expected_intermediate_agent_responses": [],
          "reference": "I can roll dice of different sizes and check if a number
            is prime. I can also use multiple tools in parallel.\n"
        },
        {
          "query": "Roll a 17 sided dice twice for me",
          "expected_tool_use": [
            {
              "tool_name": "roll_die",
              "tool_input": {
                "sides": 17
              }
            },
            {
              "tool_name": "roll_die",
              "tool_input": {
                "sides": 17
              }
            }
          ],
          "expected_intermediate_agent_responses": [],
          "reference": "I have rolled a 17 sided die twice. The first roll was
            13 and the second roll was 4.\n"
        }
      ],
      "initial_session": {
        "state": {},
        "app_name": "hello_world",
        "user_id": "user"
      }
    }
  ]
  """
  eval_cases = []
  for old_eval_case in eval_set_in_json_format:
    new_invocations = [
        _convert_invocation_to_pydantic_schema(old_invocation)
        for old_invocation in old_eval_case["data"]
    ]

    # Session input is optional on EvalCase, so an old eval case that has no
    # (or an empty) "initial_session" is converted without one instead of
    # aborting the whole conversion with a KeyError.
    old_initial_session = old_eval_case.get("initial_session")
    session_input = None
    if old_initial_session:
      session_input = SessionInput(
          app_name=old_initial_session["app_name"],
          user_id=old_initial_session["user_id"],
          state=old_initial_session.get("state", {}),
      )

    eval_cases.append(
        EvalCase(
            eval_id=old_eval_case["name"],
            conversation=new_invocations,
            session_input=session_input,
            creation_timestamp=time.time(),
        )
    )

  return EvalSet(
      eval_set_id=eval_set_id,
      name=eval_set_id,
      creation_timestamp=time.time(),
      eval_cases=eval_cases,
  )
class LocalEvalSetsManager(EvalSetsManager): class LocalEvalSetsManager(EvalSetsManager):
"""An EvalSets manager that stores eval sets locally on disk.""" """An EvalSets manager that stores eval sets locally on disk."""
@ -32,12 +161,20 @@ class LocalEvalSetsManager(EvalSetsManager):
self._agent_dir = agent_dir self._agent_dir = agent_dir
@override @override
def get_eval_set(self, app_name: str, eval_set_id: str) -> Any: def get_eval_set(self, app_name: str, eval_set_id: str) -> EvalSet:
"""Returns an EvalSet identified by an app_name and eval_set_id.""" """Returns an EvalSet identified by an app_name and eval_set_id."""
# Load the eval set file data # Load the eval set file data
eval_set_file_path = self._get_eval_set_file_path(app_name, eval_set_id) eval_set_file_path = self._get_eval_set_file_path(app_name, eval_set_id)
with open(eval_set_file_path, "r") as file: with open(eval_set_file_path, "r", encoding="utf-8") as f:
return json.load(file) # Load JSON into a list content = f.read()
try:
return EvalSet.model_validate_json(content)
except ValidationError:
# We assume that the eval data was specified in the old format and try
# to convert it to the new format.
return convert_eval_set_to_pydanctic_schema(
eval_set_id, json.loads(content)
)
@override @override
def create_eval_set(self, app_name: str, eval_set_id: str): def create_eval_set(self, app_name: str, eval_set_id: str):
@ -52,9 +189,13 @@ class LocalEvalSetsManager(EvalSetsManager):
if not os.path.exists(new_eval_set_path): if not os.path.exists(new_eval_set_path):
# Write the JSON string to the file # Write the JSON string to the file
logger.info("Eval set file doesn't exist, we will create a new one.") logger.info("Eval set file doesn't exist, we will create a new one.")
with open(new_eval_set_path, "w") as f: new_eval_set = EvalSet(
empty_content = json.dumps([], indent=2) eval_set_id=eval_set_id,
f.write(empty_content) name=eval_set_id,
eval_cases=[],
creation_timestamp=time.time(),
)
self._write_eval_set(new_eval_set_path, new_eval_set)
@override @override
def list_eval_sets(self, app_name: str) -> list[str]: def list_eval_sets(self, app_name: str) -> list[str]:
@ -70,26 +211,23 @@ class LocalEvalSetsManager(EvalSetsManager):
return sorted(eval_sets) return sorted(eval_sets)
@override @override
def add_eval_case(self, app_name: str, eval_set_id: str, eval_case: Any): def add_eval_case(self, app_name: str, eval_set_id: str, eval_case: EvalCase):
"""Adds the given EvalCase to an existing EvalSet identified by app_name and eval_set_id.""" """Adds the given EvalCase to an existing EvalSet identified by app_name and eval_set_id."""
eval_case_id = eval_case["name"] eval_case_id = eval_case.eval_id
self._validate_id(id_name="Eval Case Id", id_value=eval_case_id) self._validate_id(id_name="Eval Case Id", id_value=eval_case_id)
# Load the eval set file data eval_set = self.get_eval_set(app_name, eval_set_id)
eval_set_file_path = self._get_eval_set_file_path(app_name, eval_set_id)
with open(eval_set_file_path, "r") as file:
eval_set_data = json.load(file) # Load JSON into a list
if [x for x in eval_set_data if x["name"] == eval_case_id]: if [x for x in eval_set.eval_cases if x.eval_id == eval_case_id]:
raise ValueError( raise ValueError(
f"Eval id `{eval_case_id}` already exists in `{eval_set_id}`" f"Eval id `{eval_case_id}` already exists in `{eval_set_id}`"
" eval set.", " eval set.",
) )
eval_set_data.append(eval_case) eval_set.eval_cases.append(eval_case)
# Serialize the test data to JSON and write to the eval set file.
with open(eval_set_file_path, "w") as f: eval_set_file_path = self._get_eval_set_file_path(app_name, eval_set_id)
f.write(json.dumps(eval_set_data, indent=2)) self._write_eval_set(eval_set_file_path, eval_set)
def _get_eval_set_file_path(self, app_name: str, eval_set_id: str) -> str: def _get_eval_set_file_path(self, app_name: str, eval_set_id: str) -> str:
return os.path.join( return os.path.join(
@ -104,3 +242,7 @@ class LocalEvalSetsManager(EvalSetsManager):
raise ValueError( raise ValueError(
f"Invalid {id_name}. {id_name} should have the `{pattern}` format", f"Invalid {id_name}. {id_name} should have the `{pattern}` format",
) )
def _write_eval_set(self, eval_set_path: str, eval_set: EvalSet):
  """Writes eval_set to eval_set_path as indented JSON.

  Any existing file at that path is overwritten.

  Args:
    eval_set_path: Path of the eval set file to write.
    eval_set: The eval set to serialize with `model_dump_json`.
  """
  # get_eval_set reads these files as UTF-8; write them with the same encoding
  # so the round trip does not depend on the platform's default locale.
  with open(eval_set_path, "w", encoding="utf-8") as f:
    f.write(eval_set.model_dump_json(indent=2))