Merge branch 'main' into main

commit e80d0e4993
Hangfei Lin, 2025-04-10 18:04:20 -07:00, committed by GitHub
GPG Key ID: B5690EEEBB952194 (no known key found for this signature in database)

4 changed files with 5 additions and 5 deletions

View File

@@ -66,7 +66,7 @@ def convert_session_to_eval_format(session: Session) -> list[dict[str, Any]]:
               'tool_input': tool_input,
           })
         elif subsequent_part.text:
-          # Also keep track of all the natural langauge responses that
+          # Also keep track of all the natural language responses that
           # agent (or sub agents) generated.
           intermediate_agent_responses.append(
               {'author': event_author, 'text': subsequent_part.text}
@@ -75,7 +75,7 @@ def convert_session_to_eval_format(session: Session) -> list[dict[str, Any]]:
     # If we are here then either we are done reading all the events or we
     # encountered an event that had content authored by the end-user.
     # This, basically means an end of turn.
-    # We assume that the last natural langauge intermediate response is the
+    # We assume that the last natural language intermediate response is the
     # final response from the agent/model. We treat that as a reference.
     eval_case.append({
         'query': query,
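
The comments touched by these two hunks describe the turn-grouping rule in convert_session_to_eval_format: intermediate natural-language responses are accumulated while a turn's events are scanned, and when a user-authored event (or the end of the event list) closes the turn, the last such response is treated as the reference answer for the query. Below is a minimal sketch of that rule, using plain dicts with assumed 'author'/'text' keys instead of the real Session and Event types; it illustrates the grouping logic, not the actual implementation.

from typing import Any


def group_turns(events: list[dict[str, Any]]) -> list[dict[str, Any]]:
  """Sketch of the turn-grouping rule; not the real convert_session_to_eval_format."""
  eval_case: list[dict[str, Any]] = []
  query = None
  intermediate_agent_responses: list[dict[str, Any]] = []

  def close_turn() -> None:
    # The last natural-language response seen in the turn is treated as the
    # reference (expected final answer); earlier ones stay as intermediates.
    if query is None:
      return
    reference = (
        intermediate_agent_responses[-1]['text']
        if intermediate_agent_responses
        else ''
    )
    eval_case.append({
        'query': query,
        'intermediate_agent_responses': intermediate_agent_responses[:-1],
        'reference': reference,
    })

  for event in events:
    if event['author'] == 'user':
      close_turn()  # a user-authored event marks the end of the previous turn
      query = event['text']
      intermediate_agent_responses = []
    elif event.get('text'):
      # Keep track of all the natural language responses the agent
      # (or sub agents) generated during this turn.
      intermediate_agent_responses.append(
          {'author': event['author'], 'text': event['text']}
      )

  close_turn()  # done reading all the events: flush the final turn
  return eval_case


turns = group_turns([
    {'author': 'user', 'text': 'What is 2 + 2?'},
    {'author': 'math_agent', 'text': 'Let me compute that.'},
    {'author': 'math_agent', 'text': '2 + 2 is 4.'},
])
print(turns[0]['reference'])  # -> '2 + 2 is 4.'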

View File

@@ -42,7 +42,7 @@ class ResponseEvaluator:
   A note on evaluation_criteria:
   `response_match_score`: This metric compares the agents final natural
-  language reponse with the expected final response, stored in the
+  language response with the expected final response, stored in the
   "reference" field in test/eval files. We use Rouge metric to compare the
   two responses.
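
Per this docstring, response_match_score compares the agent's final natural-language response with the "reference" field using the Rouge metric. A rough sketch of such a comparison with the rouge-score package follows; the choice of ROUGE-1 and of the F-measure here are assumptions for illustration, not necessarily the exact configuration ResponseEvaluator uses.

# pip install rouge-score
from rouge_score import rouge_scorer


def response_match_score(response: str, reference: str) -> float:
  """Illustrative ROUGE-1 F-measure between an agent response and its reference."""
  scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
  # score(target, prediction) returns {'rouge1': Score(precision, recall, fmeasure)}
  scores = scorer.score(reference, response)
  return scores['rouge1'].fmeasure


print(response_match_score(
    'It is rainy in Seattle today.',
    'The weather in Seattle is rainy today.',
))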

View File

@@ -310,7 +310,7 @@ def _merge_function_response_events(
     function_response_events: A list of function_response events.
       NOTE: function_response_events must fulfill these requirements: 1. The
       list is in increasing order of timestamp; 2. the first event is the
-      initial function_reponse event; 3. all later events should contain at
+      initial function_response event; 3. all later events should contain at
       least one function_response part that related to the function_call
       event. (Note, 3. may not be true when aync function return some
       intermediate response, there could also be some intermediate model
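
The NOTE above states three preconditions on function_response_events: the list is sorted by timestamp, the first event is the initial function_response, and later events carry at least one related function_response part. Here is a simplified sketch of checking the ordering and folding later function_response parts into the first event; the FakeEvent/FakePart types and their fields are stand-ins invented for illustration, not the real Event class.

from dataclasses import dataclass, field
from typing import Optional


@dataclass
class FakePart:
  """Stand-in for a content part; only the function_response payload matters here."""
  function_response: Optional[dict] = None
  text: Optional[str] = None


@dataclass
class FakeEvent:
  """Stand-in for an event carrying a timestamp and a list of parts."""
  timestamp: float
  parts: list[FakePart] = field(default_factory=list)


def merge_function_response_events(events: list[FakeEvent]) -> FakeEvent:
  """Fold function_response parts from later events into the initial event."""
  if not events:
    raise ValueError('at least one function_response event is required')
  # Requirement 1: the list must be in increasing order of timestamp.
  timestamps = [event.timestamp for event in events]
  if timestamps != sorted(timestamps):
    raise ValueError('function_response events must be ordered by timestamp')

  # Requirement 2: the first event is the initial function_response event.
  merged = FakeEvent(timestamp=events[0].timestamp, parts=list(events[0].parts))
  # Requirement 3: later events should contain at least one related
  # function_response part; copy those parts into the merged event.
  for later in events[1:]:
    for part in later.parts:
      if part.function_response is not None:
        merged.parts.append(part)
  return merged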

View File

@@ -231,7 +231,7 @@ class MockModel(BaseLlm):
     if not responses:
       return cls(responses=[])
     elif isinstance(responses[0], LlmResponse):
-      # reponses is list[LlmResponse]
+      # responses is list[LlmResponse]
       return cls(responses=responses)
     else:
       responses = [
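
This hunk is the factory that normalizes the responses argument for the mock model: an empty list passes through, a list of LlmResponse objects is used as-is, and anything else (the truncated else branch) is presumably wrapped into response objects. Below is a self-contained sketch of that normalization plus a replay-style mock, using a FakeLlmResponse stand-in rather than the real BaseLlm/LlmResponse classes; the wrapping in the else branch and the generate method are assumptions for illustration.

from dataclasses import dataclass


@dataclass
class FakeLlmResponse:
  """Stand-in for LlmResponse; the real class carries structured content parts."""
  text: str


class FakeMockModel:
  """Replays canned responses in order, the way a test mock model would."""

  def __init__(self, responses: list[FakeLlmResponse]):
    self.responses = list(responses)

  @classmethod
  def create(cls, responses):
    if not responses:
      return cls(responses=[])
    elif isinstance(responses[0], FakeLlmResponse):
      # responses is list[FakeLlmResponse]; use them as-is
      return cls(responses=responses)
    else:
      # Assumed behaviour of the truncated else branch: wrap raw strings.
      return cls(responses=[FakeLlmResponse(text=r) for r in responses])

  def generate(self) -> FakeLlmResponse:
    if not self.responses:
      raise RuntimeError('no canned responses left')
    return self.responses.pop(0)


model = FakeMockModel.create(['first canned reply', 'second canned reply'])
print(model.generate().text)  # -> first canned reply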