adk-python/src/google/adk/evaluation/response_evaluator.py

# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any

import pandas as pd
from tabulate import tabulate
from vertexai.preview.evaluation import EvalTask
from vertexai.preview.evaluation import MetricPromptTemplateExamples


class ResponseEvaluator:
"""Runs response evaluation for agents."""
@staticmethod
def evaluate(
raw_eval_dataset: list[list[dict[str, Any]]],
evaluation_criteria: list[str],
*,
print_detailed_results: bool = False
):
r"""Returns the value of requested evaluation metrics.
Args:
raw_eval_dataset: The dataset that will be evaluated.
evaluation_criteria: The evaluation criteria to be used. This method
        supports two criteria, `response_evaluation_score` and
`response_match_score`.
print_detailed_results: Prints detailed results on the console. This is
usually helpful during debugging.
A note on evaluation_criteria:
      `response_match_score`: This metric compares the agent's final natural
        language response with the expected final response, stored in the
        "reference" field in test/eval files. We use the ROUGE-1 metric to
        compare the two responses.
Value Range: [0, 1]. A score closer to 0 means poor similarity between
response and reference. A score closer to 1 means strong similarity
between response and reference.
      `response_evaluation_score`: Uses an LLM to evaluate the coherence of
        the response, including tool use. This is a pointwise metric.
Value range: [0, 5], where 0 means that the agent's response is not
        coherent, while 5 means it is. Higher values are better.
A note on raw_eval_dataset:
      The dataset should be a list of sessions, where each session is
      represented as a list of interactions that need evaluation. Each
      interaction is represented as a dictionary that is expected to have
      values for the following keys:
1) query
2) response
        3) actual_tool_use
4) expected_tool_use
5) reference
Here is a sample eval_dataset value with one entry:
[
[
{
"query": "roll a die for me",
"response": "I rolled a 16 sided die and got 13.\n",
"expected_tool_use": [
{
"tool_name": "roll_die",
"tool_input": {
"sides": 16
}
}
],
"acutal_tool_use": [
{
"tool_name": "roll_die",
"tool_input": {
"sides": 16
}
}
],
"reference": "I rolled a 16 sided die and got 13.\n"
}
]
]
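
    Example usage (illustrative; assumes the sample above is bound to
    `eval_dataset` and that Vertex AI evaluation is configured for the
    active Google Cloud project):

      summary = ResponseEvaluator.evaluate(
          raw_eval_dataset=eval_dataset,
          evaluation_criteria=["response_match_score"],
          print_detailed_results=True,
      )
      # `summary` is the summary_metrics dict produced by the underlying
      # Vertex AI EvalTask run.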
"""
if not raw_eval_dataset:
raise ValueError("The evaluation dataset is empty.")
metrics = ResponseEvaluator._get_metrics(
raw_eval_dataset, evaluation_criteria
)
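    # Flatten the nested per-session lists into a single list of evaluation
    # dicts, one row per interaction.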
flattened_queries = [
item for sublist in raw_eval_dataset for item in sublist
]
eval_dataset = pd.DataFrame(flattened_queries).rename(
columns={"query": "prompt", "expected_tool_use": "reference_trajectory"}
)
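    # The rename maps the agent "query" to "prompt" and "expected_tool_use"
    # to "reference_trajectory", the column names used by the Vertex AI
    # evaluation service; other keys such as "response" and "reference" keep
    # their original names.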
eval_result = ResponseEvaluator._perform_eval(
dataset=eval_dataset, metrics=metrics
)
if print_detailed_results:
ResponseEvaluator._print_results(eval_result)
return eval_result.summary_metrics
@staticmethod
def _get_metrics(raw_eval_dataset, criteria):
metrics = []
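    # `response_evaluation_score` is served by the Vertex AI pointwise
    # COHERENCE metric; it is only added when the dataset rows carry both a
    # query and an expected_tool_use entry.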
if (
"response_evaluation_score" in criteria
and "query" in raw_eval_dataset[0][0]
and "expected_tool_use" in raw_eval_dataset[0][0]
):
metrics.append(MetricPromptTemplateExamples.Pointwise.COHERENCE)
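    # `response_match_score` is served by the rouge_1 metric and requires a
    # "reference" answer to compare the agent response against.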
if (
"response_match_score" in criteria
and "reference" in raw_eval_dataset[0][0]
):
metrics.append("rouge_1")
return metrics
@staticmethod
def _perform_eval(dataset, metrics):
"""This method hides away the call to external service.
Primarily helps with unit testing.
"""
eval_task = EvalTask(dataset=dataset, metrics=metrics)
return eval_task.evaluate()
@staticmethod
def _print_results(eval_result):
print("Evaluation Summary Metrics:", eval_result.summary_metrics)
print(tabulate(eval_result.metrics_table, headers="keys", tablefmt="grid"))