No public description

PiperOrigin-RevId: 748777998
Authored by Google ADK Member on 2025-04-17 19:50:22 +00:00; committed by hangfei
parent 290058eb05
commit 61d4be2d76
99 changed files with 2120 additions and 256 deletions


@@ -0,0 +1,13 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


@@ -0,0 +1,259 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for the Response Evaluator."""
from unittest.mock import MagicMock
from unittest.mock import patch
from google.adk.evaluation.response_evaluator import ResponseEvaluator
import pandas as pd
import pytest
from vertexai.preview.evaluation import MetricPromptTemplateExamples
# Mock object for the result normally returned by _perform_eval
MOCK_EVAL_RESULT = MagicMock()
MOCK_EVAL_RESULT.summary_metrics = {"mock_metric": 0.75, "another_mock": 3.5}
# Add a metrics_table for testing _print_results interaction
MOCK_EVAL_RESULT.metrics_table = pd.DataFrame({
"prompt": ["mock_query1"],
"response": ["mock_resp1"],
"mock_metric": [0.75],
})
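# The mock mirrors the shape of the eval result the tests rely on: a
# summary_metrics dict plus a metrics_table DataFrame (assumed to match the
# object the Vertex AI evaluation SDK would return from _perform_eval).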
SAMPLE_TURN_1_ALL_KEYS = {
"query": "query1",
"response": "response1",
"actual_tool_use": [{"tool_name": "tool_a", "tool_input": {}}],
"expected_tool_use": [{"tool_name": "tool_a", "tool_input": {}}],
"reference": "reference1",
}
SAMPLE_TURN_2_MISSING_REF = {
"query": "query2",
"response": "response2",
"actual_tool_use": [],
"expected_tool_use": [],
# "reference": "reference2" # Missing
}
SAMPLE_TURN_3_MISSING_EXP_TOOLS = {
"query": "query3",
"response": "response3",
"actual_tool_use": [{"tool_name": "tool_b", "tool_input": {}}],
# "expected_tool_use": [], # Missing
"reference": "reference3",
}
SAMPLE_TURN_4_MINIMAL = {
"query": "query4",
"response": "response4",
# Minimal keys, others missing
}
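# Per the assertions below, ResponseEvaluator.evaluate is expected to flatten
# each turn into one DataFrame row, renaming "query" -> "prompt" and
# "expected_tool_use" -> "reference_trajectory".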
@patch(
"google.adk.evaluation.response_evaluator.ResponseEvaluator._perform_eval"
)
class TestResponseEvaluator:
"""A class to help organize "patch" that are applicabple to all tests."""
def test_evaluate_none_dataset_raises_value_error(self, mock_perform_eval):
"""Test evaluate function raises ValueError for an empty list."""
with pytest.raises(ValueError, match="The evaluation dataset is empty."):
ResponseEvaluator.evaluate(None, ["response_evaluation_score"])
mock_perform_eval.assert_not_called() # Ensure _perform_eval was not called
def test_evaluate_empty_dataset_raises_value_error(self, mock_perform_eval):
"""Test evaluate function raises ValueError for an empty list."""
with pytest.raises(ValueError, match="The evaluation dataset is empty."):
ResponseEvaluator.evaluate([], ["response_evaluation_score"])
mock_perform_eval.assert_not_called() # Ensure _perform_eval was not called
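# The next test exercises the criteria-to-metric mapping asserted below:
# "response_evaluation_score" -> pointwise COHERENCE, "response_match_score"
# -> "rouge_1". A metric is only requested when the first turn provides the
# keys it needs.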
def test_evaluate_determines_metrics_correctly_for_perform_eval(
self, mock_perform_eval
):
"""Test that the correct metrics list is passed to _perform_eval based on criteria/keys."""
mock_perform_eval.return_value = MOCK_EVAL_RESULT
# Test case 1: Only Coherence
raw_data_1 = [[SAMPLE_TURN_1_ALL_KEYS]]
criteria_1 = ["response_evaluation_score"]
ResponseEvaluator.evaluate(raw_data_1, criteria_1)
_, kwargs = mock_perform_eval.call_args
assert kwargs["metrics"] == [
MetricPromptTemplateExamples.Pointwise.COHERENCE
]
mock_perform_eval.reset_mock() # Reset mock for next call
# Test case 2: Only Rouge
raw_data_2 = [[SAMPLE_TURN_1_ALL_KEYS]]
criteria_2 = ["response_match_score"]
ResponseEvaluator.evaluate(raw_data_2, criteria_2)
_, kwargs = mock_perform_eval.call_args
assert kwargs["metrics"] == ["rouge_1"]
mock_perform_eval.reset_mock()
# Test case 3: No metrics if keys missing in first turn
raw_data_3 = [[SAMPLE_TURN_4_MINIMAL, SAMPLE_TURN_1_ALL_KEYS]]
criteria_3 = ["response_evaluation_score", "response_match_score"]
ResponseEvaluator.evaluate(raw_data_3, criteria_3)
_, kwargs = mock_perform_eval.call_args
assert kwargs["metrics"] == []
mock_perform_eval.reset_mock()
# Test case 4: No metrics if criteria empty
raw_data_4 = [[SAMPLE_TURN_1_ALL_KEYS]]
criteria_4 = []
ResponseEvaluator.evaluate(raw_data_4, criteria_4)
_, kwargs = mock_perform_eval.call_args
assert kwargs["metrics"] == []
mock_perform_eval.reset_mock()
def test_evaluate_calls_perform_eval_correctly_all_metrics(
self, mock_perform_eval
):
"""Test evaluate function calls _perform_eval with expected args when all criteria/keys are present."""
# Arrange
mock_perform_eval.return_value = (
MOCK_EVAL_RESULT # Configure the mock return value
)
raw_data = [[SAMPLE_TURN_1_ALL_KEYS]]
criteria = ["response_evaluation_score", "response_match_score"]
# Act
summary = ResponseEvaluator.evaluate(raw_data, criteria)
# Assert
# 1. Check metrics determined by _get_metrics (passed to _perform_eval)
expected_metrics_list = [
MetricPromptTemplateExamples.Pointwise.COHERENCE,
"rouge_1",
]
# 2. Check DataFrame prepared (passed to _perform_eval)
expected_df_data = [{
"prompt": "query1",
"response": "response1",
"actual_tool_use": [{"tool_name": "tool_a", "tool_input": {}}],
"reference_trajectory": [{"tool_name": "tool_a", "tool_input": {}}],
"reference": "reference1",
}]
expected_df = pd.DataFrame(expected_df_data)
# Assert _perform_eval was called once
mock_perform_eval.assert_called_once()
# Get the arguments passed to the mocked _perform_eval
_, kwargs = mock_perform_eval.call_args
# Check the 'dataset' keyword argument
pd.testing.assert_frame_equal(kwargs["dataset"], expected_df)
# Check the 'metrics' keyword argument
assert kwargs["metrics"] == expected_metrics_list
# 3. Check the correct summary metrics are returned
# (from mock_perform_eval's return value)
assert summary == MOCK_EVAL_RESULT.summary_metrics
def test_evaluate_prepares_dataframe_correctly_for_perform_eval(
self, mock_perform_eval
):
"""Test that the DataFrame is correctly flattened and renamed before passing to _perform_eval."""
mock_perform_eval.return_value = MOCK_EVAL_RESULT
raw_data = [
[SAMPLE_TURN_1_ALL_KEYS], # Conversation 1
[
SAMPLE_TURN_2_MISSING_REF,
SAMPLE_TURN_3_MISSING_EXP_TOOLS,
], # Conversation 2
]
criteria = [
"response_match_score"
] # Doesn't affect the DataFrame structure
ResponseEvaluator.evaluate(raw_data, criteria)
# Expected flattened and renamed data
expected_df_data = [
# Turn 1 (from SAMPLE_TURN_1_ALL_KEYS)
{
"prompt": "query1",
"response": "response1",
"actual_tool_use": [{"tool_name": "tool_a", "tool_input": {}}],
"reference_trajectory": [{"tool_name": "tool_a", "tool_input": {}}],
"reference": "reference1",
},
# Turn 2 (from SAMPLE_TURN_2_MISSING_REF)
{
"prompt": "query2",
"response": "response2",
"actual_tool_use": [],
"reference_trajectory": [],
# "reference": None # Missing key results in NaN in DataFrame
# usually
},
# Turn 3 (from SAMPLE_TURN_3_MISSING_EXP_TOOLS)
{
"prompt": "query3",
"response": "response3",
"actual_tool_use": [{"tool_name": "tool_b", "tool_input": {}}],
# "reference_trajectory": None, # Missing key results in NaN
"reference": "reference3",
},
]
# Need to be careful with missing keys -> NaN when creating DataFrame
# Pandas handles this automatically when creating from list of dicts
expected_df = pd.DataFrame(expected_df_data)
mock_perform_eval.assert_called_once()
_, kwargs = mock_perform_eval.call_args
# Compare the DataFrame passed to the mock
pd.testing.assert_frame_equal(kwargs["dataset"], expected_df)
@patch(
"google.adk.evaluation.response_evaluator.ResponseEvaluator._print_results"
) # Mock the private print method
def test_evaluate_print_detailed_results(
self, mock_print_results, mock_perform_eval
):
"""Test _print_results function is called when print_detailed_results=True."""
mock_perform_eval.return_value = (
MOCK_EVAL_RESULT # Ensure _perform_eval returns our mock result
)
raw_data = [[SAMPLE_TURN_1_ALL_KEYS]]
criteria = ["response_match_score"]
ResponseEvaluator.evaluate(raw_data, criteria, print_detailed_results=True)
# Assert _perform_eval was called
mock_perform_eval.assert_called_once()
# Assert _print_results was called once with the result object
# from _perform_eval
mock_print_results.assert_called_once_with(MOCK_EVAL_RESULT)
@patch(
"google.adk.evaluation.response_evaluator.ResponseEvaluator._print_results"
)
def test_evaluate_no_print_detailed_results(
self, mock_print_results, mock_perform_eval
):
"""Test _print_results function is NOT called when print_detailed_results=False (default)."""
mock_perform_eval.return_value = MOCK_EVAL_RESULT
raw_data = [[SAMPLE_TURN_1_ALL_KEYS]]
criteria = ["response_match_score"]
ResponseEvaluator.evaluate(raw_data, criteria, print_detailed_results=False)
# Assert _perform_eval was called
mock_perform_eval.assert_called_once()
# Assert _print_results was NOT called
mock_print_results.assert_not_called()


@@ -0,0 +1,271 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testings for the Trajectory Evaluator."""
import math
from google.adk.evaluation.trajectory_evaluator import TrajectoryEvaluator
import pytest
# Define reusable tool call structures
TOOL_ROLL_DICE_16 = {"tool_name": "roll_die", "tool_input": {"sides": 16}}
TOOL_ROLL_DICE_6 = {"tool_name": "roll_die", "tool_input": {"sides": 6}}
TOOL_GET_WEATHER = {
"tool_name": "get_weather",
"tool_input": {"location": "Paris"},
}
TOOL_GET_WEATHER_SF = {
"tool_name": "get_weather",
"tool_input": {"location": "SF"},
}
# Sample data for turns
TURN_MATCH = {
"query": "Q1",
"response": "R1",
"actual_tool_use": [TOOL_ROLL_DICE_16],
"expected_tool_use": [TOOL_ROLL_DICE_16],
}
TURN_MISMATCH_INPUT = {
"query": "Q2",
"response": "R2",
"actual_tool_use": [TOOL_ROLL_DICE_6],
"expected_tool_use": [TOOL_ROLL_DICE_16],
}
TURN_MISMATCH_NAME = {
"query": "Q3",
"response": "R3",
"actual_tool_use": [TOOL_GET_WEATHER],
"expected_tool_use": [TOOL_ROLL_DICE_16],
}
TURN_MATCH_MULTIPLE = {
"query": "Q4",
"response": "R4",
"actual_tool_use": [TOOL_GET_WEATHER, TOOL_ROLL_DICE_6],
"expected_tool_use": [TOOL_GET_WEATHER, TOOL_ROLL_DICE_6],
}
TURN_MISMATCH_ORDER = {
"query": "Q5",
"response": "R5",
"actual_tool_use": [TOOL_ROLL_DICE_6, TOOL_GET_WEATHER],
"expected_tool_use": [TOOL_GET_WEATHER, TOOL_ROLL_DICE_6],
}
TURN_MISMATCH_LENGTH_ACTUAL_LONGER = {
"query": "Q6",
"response": "R6",
"actual_tool_use": [TOOL_GET_WEATHER, TOOL_ROLL_DICE_6],
"expected_tool_use": [TOOL_GET_WEATHER],
}
TURN_MISMATCH_LENGTH_EXPECTED_LONGER = {
"query": "Q7",
"response": "R7",
"actual_tool_use": [TOOL_GET_WEATHER],
"expected_tool_use": [TOOL_GET_WEATHER, TOOL_ROLL_DICE_6],
}
TURN_MATCH_WITH_MOCK_OUTPUT = {
"query": "Q8",
"response": "R8",
"actual_tool_use": [TOOL_GET_WEATHER_SF],
"expected_tool_use": [
{**TOOL_GET_WEATHER_SF, "mock_tool_output": "Sunny"}
], # Add mock output to expected
}
TURN_MATCH_EMPTY_TOOLS = {
"query": "Q9",
"response": "R9",
"actual_tool_use": [],
"expected_tool_use": [],
}
TURN_MISMATCH_EMPTY_VS_NONEMPTY = {
"query": "Q10",
"response": "R10",
"actual_tool_use": [],
"expected_tool_use": [TOOL_GET_WEATHER],
}
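# Per the assertions below, a turn scores 1.0 when actual_tool_use exactly
# matches expected_tool_use (same tool names, inputs, and order) and 0.0
# otherwise; the overall result is the mean across all evaluated turns.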
def test_evaluate_none_dataset_raises_value_error():
"""Tests evaluate function raises ValueError for an empty list."""
with pytest.raises(ValueError, match="The evaluation dataset is empty."):
TrajectoryEvaluator.evaluate(None)
def test_evaluate_empty_dataset_raises_value_error():
"""Tests evaluate function raises ValueError for an empty list."""
with pytest.raises(ValueError, match="The evaluation dataset is empty."):
TrajectoryEvaluator.evaluate([])
def test_evaluate_single_turn_match():
"""Tests evaluate function with one conversation, one turn, perfect match."""
eval_dataset = [[TURN_MATCH]]
assert TrajectoryEvaluator.evaluate(eval_dataset) == 1.0
def test_evaluate_single_turn_mismatch():
"""Tests evaluate function with one conversation, one turn, mismatch."""
eval_dataset = [[TURN_MISMATCH_INPUT]]
assert TrajectoryEvaluator.evaluate(eval_dataset) == 0.0
def test_evaluate_multiple_turns_all_match():
"""Tests evaluate function with one conversation, multiple turns, all match."""
eval_dataset = [[TURN_MATCH, TURN_MATCH_MULTIPLE, TURN_MATCH_EMPTY_TOOLS]]
assert TrajectoryEvaluator.evaluate(eval_dataset) == 1.0
def test_evaluate_multiple_turns_mixed():
"""Tests evaluate function with one conversation, mixed match/mismatch turns."""
eval_dataset = [
[TURN_MATCH, TURN_MISMATCH_NAME, TURN_MATCH_MULTIPLE, TURN_MISMATCH_ORDER]
]
# Expected: (1.0 + 0.0 + 1.0 + 0.0) / 4 = 0.5
assert TrajectoryEvaluator.evaluate(eval_dataset) == 0.5
def test_evaluate_multiple_conversations_mixed():
"""Tests evaluate function with multiple conversations, mixed turns."""
eval_dataset = [
[TURN_MATCH, TURN_MISMATCH_INPUT], # Conv 1: 1.0, 0.0 -> Avg 0.5
[TURN_MATCH_MULTIPLE], # Conv 2: 1.0 -> Avg 1.0
[
TURN_MISMATCH_ORDER,
TURN_MISMATCH_LENGTH_ACTUAL_LONGER,
TURN_MATCH,
], # Conv 3: 0.0, 0.0, 1.0 -> Avg 1/3
]
# Expected: (1.0 + 0.0 + 1.0 + 0.0 + 0.0 + 1.0) / 6 = 3.0 / 6 = 0.5
assert TrajectoryEvaluator.evaluate(eval_dataset) == 0.5
def test_evaluate_ignores_mock_tool_output_in_expected():
"""Tests evaluate function correctly compares even if expected has mock_tool_output."""
eval_dataset = [[TURN_MATCH_WITH_MOCK_OUTPUT]]
assert TrajectoryEvaluator.evaluate(eval_dataset) == 1.0
def test_evaluate_match_empty_tool_lists():
"""Tests evaluate function correctly matches empty tool lists."""
eval_dataset = [[TURN_MATCH_EMPTY_TOOLS]]
assert TrajectoryEvaluator.evaluate(eval_dataset) == 1.0
def test_evaluate_mismatch_empty_vs_nonempty():
"""Tests evaluate function correctly mismatches empty vs non-empty tool lists."""
eval_dataset = [[TURN_MISMATCH_EMPTY_VS_NONEMPTY]]
assert TrajectoryEvaluator.evaluate(eval_dataset) == 0.0
eval_dataset_rev = [[{
**TURN_MISMATCH_EMPTY_VS_NONEMPTY, # Swap actual/expected
"actual_tool_use": [TOOL_GET_WEATHER],
"expected_tool_use": [],
}]]
assert TrajectoryEvaluator.evaluate(eval_dataset_rev) == 0.0
def test_evaluate_dataset_with_empty_conversation():
"""Tests evaluate function handles dataset containing an empty conversation list."""
eval_dataset = [[TURN_MATCH], []] # One valid conversation, one empty
# Should only evaluate the first conversation -> 1.0 / 1 turn = 1.0
assert TrajectoryEvaluator.evaluate(eval_dataset) == 1.0
def test_evaluate_dataset_only_empty_conversation():
"""Tests evaluate function handles dataset with only an empty conversation."""
eval_dataset = [[]]
# No rows are evaluated, and the mean of an empty series is NaN. Depending on
# the desired behavior this could be 0.0 or NaN; the current code returns NaN.
assert math.isnan(TrajectoryEvaluator.evaluate(eval_dataset))
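# A minimal illustration of the NaN case above, assuming the evaluator
# averages turn scores with pandas:
#   import pandas as pd
#   pd.Series(dtype=float).mean()  # -> nan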
def test_evaluate_print_detailed_results(capsys):
"""Tests evaluate function runs with print_detailed_results=True and prints something."""
eval_dataset = [[TURN_MATCH, TURN_MISMATCH_INPUT]]
TrajectoryEvaluator.evaluate(eval_dataset, print_detailed_results=True)
captured = capsys.readouterr()
assert "query" in captured.out # Check if the results table header is printed
assert "R1" in captured.out # Check if some data is printed
assert "Failures:" in captured.out # Check if failures header is printed
assert "Q2" in captured.out # Check if the failing query is printed
def test_evaluate_no_failures_print(capsys):
"""Tests evaluate function does not print Failures section when all turns match."""
eval_dataset = [[TURN_MATCH]]
TrajectoryEvaluator.evaluate(eval_dataset, print_detailed_results=True)
captured = capsys.readouterr()
assert "query" in captured.out # Results table should still print
assert "Failures:" not in captured.out # Failures section should NOT print
def test_are_tools_equal_identical():
"""Tests are_tools_equal function with identical lists."""
list_a = [TOOL_GET_WEATHER, TOOL_ROLL_DICE_6]
list_b = [TOOL_GET_WEATHER, TOOL_ROLL_DICE_6]
assert TrajectoryEvaluator.are_tools_equal(list_a, list_b)
def test_are_tools_equal_empty():
"""Tests are_tools_equal function with empty lists."""
assert TrajectoryEvaluator.are_tools_equal([], [])
def test_are_tools_equal_different_order():
"""Tests are_tools_equal function with same tools, different order."""
list_a = [TOOL_ROLL_DICE_6, TOOL_GET_WEATHER]
list_b = [TOOL_GET_WEATHER, TOOL_ROLL_DICE_6]
assert not TrajectoryEvaluator.are_tools_equal(list_a, list_b)
def test_are_tools_equal_different_length():
"""Tests are_tools_equal function with lists of different lengths."""
list_a = [TOOL_GET_WEATHER, TOOL_ROLL_DICE_6]
list_b = [TOOL_GET_WEATHER]
assert not TrajectoryEvaluator.are_tools_equal(list_a, list_b)
def test_are_tools_equal_different_input_values():
"""Tests are_tools_equal function with different input values."""
list_a = [TOOL_ROLL_DICE_16]
list_b = [TOOL_ROLL_DICE_6]
assert not TrajectoryEvaluator.are_tools_equal(list_a, list_b)
def test_are_tools_equal_different_tool_names():
"""Tests are_tools_equal function with different tool names."""
list_a = [TOOL_ROLL_DICE_16]
list_b = [TOOL_GET_WEATHER]
assert not TrajectoryEvaluator.are_tools_equal(list_a, list_b)
def test_are_tools_equal_ignores_extra_keys():
"""Tests are_tools_equal function ignores keys other than tool_name/tool_input."""
list_a = [{
"tool_name": "get_weather",
"tool_input": {"location": "Paris"},
"extra_key": "abc",
}]
list_b = [{
"tool_name": "get_weather",
"tool_input": {"location": "Paris"},
"other_key": 123,
}]
assert TrajectoryEvaluator.are_tools_equal(list_a, list_b)
def test_are_tools_equal_one_empty_one_not():
"""Tests are_tools_equal function with one empty list and one non-empty list."""
list_a = []
list_b = [TOOL_GET_WEATHER]
assert not TrajectoryEvaluator.are_tools_equal(list_a, list_b)