evo-ai/.venv/lib/python3.10/site-packages/vertexai/preview/evaluation/constants.py

# -*- coding: utf-8 -*-
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Constants for evaluation."""
import dataclasses

# The number of concurrent workers to use for making model inference and
# evaluation requests.
MAX_WORKERS = 100
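# Illustrative sketch only: a pool of this size would typically fan out the
# concurrent inference and metric requests ("run_metric"/"instances" below are
# placeholder names):
#     with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
#         results = list(pool.map(run_metric, instances))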
RESPONSE_ERROR = "Error"


@dataclasses.dataclass(frozen=True)
class Metric:
"""Namespace for Metrics."""
# Model-based Pointwise Metrics.
COHERENCE = "coherence"
FLUENCY = "fluency"
SAFETY = "safety"
GROUNDEDNESS = "groundedness"
INSTRUCTION_FOLLOWING = "instruction_following"
VERBOSITY = "verbosity"
TEXT_QUALITY = "text_quality"
SUMMARIZATION_QUALITY = "summarization_quality"
QUESTION_ANSWERING_QUALITY = "question_answering_quality"
MULTI_TURN_CHAT_QUALITY = "multi_turn_chat_quality"
MULTI_TURN_SAFETY = "multi_turn_safety"
RUBRIC_BASED_INSTRUCTION_FOLLOWING = "rubric_based_instruction_following"
# Model-based Pairwise Metrics.
PAIRWISE_COHERENCE = "pairwise_coherence"
PAIRWISE_FLUENCY = "pairwise_fluency"
PAIRWISE_SAFETY = "pairwise_safety"
PAIRWISE_GROUNDEDNESS = "pairwise_groundedness"
PAIRWISE_INSTRUCTION_FOLLOWING = "pairwise_instruction_following"
PAIRWISE_VERBOSITY = "pairwise_verbosity"
PAIRWISE_TEXT_QUALITY = "pairwise_text_quality"
PAIRWISE_SUMMARIZATION_QUALITY = "pairwise_summarization_quality"
PAIRWISE_QUESTION_ANSWERING_QUALITY = "pairwise_question_answering_quality"
PAIRWISE_MULTI_TURN_CHAT_QUALITY = "pairwise_multi_turn_chat_quality"
PAIRWISE_MULTI_TURN_SAFETY = "pairwise_multi_turn_safety"
POINTWISE_METRIC = "pointwise_metric"
PAIRWISE_METRIC = "pairwise_metric"
# Automatic Metrics.
EXACT_MATCH = "exact_match"
BLEU = "bleu"
ROUGE = "rouge"
ROUGE_1 = "rouge_1"
ROUGE_2 = "rouge_2"
ROUGE_L = "rouge_l"
ROUGE_L_SUM = "rouge_l_sum"
TOOL_CALL_VALID = "tool_call_valid"
TOOL_NAME_MATCH = "tool_name_match"
TOOL_PARAMETER_KEY_MATCH = "tool_parameter_key_match"
TOOL_PARAMETER_KV_MATCH = "tool_parameter_kv_match"
TRAJECTORY_EXACT_MATCH = "trajectory_exact_match"
TRAJECTORY_IN_ORDER_MATCH = "trajectory_in_order_match"
TRAJECTORY_ANY_ORDER_MATCH = "trajectory_any_order_match"
TRAJECTORY_PRECISION = "trajectory_precision"
TRAJECTORY_RECALL = "trajectory_recall"
TRAJECTORY_SINGLE_TOOL_USE = "trajectory_single_tool_use"
LATENCY = "latency_in_seconds"
FAILURE = "failure"
AUTOMATIC_METRIC_LIST = (
EXACT_MATCH,
BLEU,
ROUGE,
ROUGE_1,
ROUGE_2,
ROUGE_L,
ROUGE_L_SUM,
TOOL_CALL_VALID,
TOOL_NAME_MATCH,
TOOL_PARAMETER_KEY_MATCH,
TOOL_PARAMETER_KV_MATCH,
)
TRAJECTORY_METRIC_LIST = (
TRAJECTORY_EXACT_MATCH,
TRAJECTORY_IN_ORDER_MATCH,
TRAJECTORY_ANY_ORDER_MATCH,
TRAJECTORY_PRECISION,
TRAJECTORY_RECALL,
TRAJECTORY_SINGLE_TOOL_USE,
)
DEFAULT_METRIC_LIST = (
LATENCY,
FAILURE,
)
POINTWISE_METRIC_PROMPT_TEMPLATE_EXAMPLE_LIST = (
COHERENCE,
FLUENCY,
SAFETY,
GROUNDEDNESS,
INSTRUCTION_FOLLOWING,
VERBOSITY,
TEXT_QUALITY,
SUMMARIZATION_QUALITY,
QUESTION_ANSWERING_QUALITY,
MULTI_TURN_CHAT_QUALITY,
MULTI_TURN_SAFETY,
)
PAIRWISE_METRIC_PROMPT_TEMPLATE_EXAMPLE_LIST = (
PAIRWISE_COHERENCE,
PAIRWISE_FLUENCY,
PAIRWISE_SAFETY,
PAIRWISE_GROUNDEDNESS,
PAIRWISE_INSTRUCTION_FOLLOWING,
PAIRWISE_VERBOSITY,
PAIRWISE_TEXT_QUALITY,
PAIRWISE_SUMMARIZATION_QUALITY,
PAIRWISE_QUESTION_ANSWERING_QUALITY,
PAIRWISE_MULTI_TURN_CHAT_QUALITY,
PAIRWISE_MULTI_TURN_SAFETY,
)
@dataclasses.dataclass(frozen=True)
class MetricResult:
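    """Namespace for Metric Results."""
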
    ROW_COUNT_KEY = "row_count"
    SCORE_KEY = "score"
    EXPLANATION_KEY = "explanation"
    CUSTOM_OUTPUT_KEY = "custom_output"
    RAW_OUTPUT_KEY = "raw_output"
    RAW_OUTPUTS_KEY = "raw_outputs"
    PAIRWISE_CHOICE_KEY = "pairwise_choice"
    IS_UNSAFE_KEY = "is_unsafe"
    IS_UNSAFE_PROBABILITY_KEY = "is_unsafe_probability"
    VIOLATED_POLICIES_KEY = "violated_policies"
    RUBRIC_LEVEL_INSTRUCTION_FOLLOWING_KEY = "per_rubric_result"

    # Automatic Metrics.
    EXACT_MATCH_RESULTS = "exact_match_results"
    BLEU_RESULTS = "bleu_results"
    ROUGE_RESULTS = "rouge_results"
    TOOL_CALL_VALID_RESULTS = "tool_call_valid_results"
    TOOL_NAME_MATCH_RESULTS = "tool_name_match_results"
    TOOL_PARAMETER_KEY_MATCH_RESULTS = "tool_parameter_key_match_results"
    TOOL_PARAMETER_KV_MATCH_RESULTS = "tool_parameter_kv_match_results"
    TRAJECTORY_EXACT_MATCH_RESULTS = "trajectory_exact_match_results"
    TRAJECTORY_IN_ORDER_MATCH_RESULTS = "trajectory_in_order_match_results"
    TRAJECTORY_ANY_ORDER_MATCH_RESULTS = "trajectory_any_order_match_results"
    TRAJECTORY_PRECISION_RESULTS = "trajectory_precision_results"
    TRAJECTORY_RECALL_RESULTS = "trajectory_recall_results"
    TRAJECTORY_SINGLE_TOOL_USE_RESULTS = "trajectory_single_tool_use_results"
    POINTWISE_METRIC_RESULT = "pointwise_metric_result"
    PAIRWISE_METRIC_RESULT = "pairwise_metric_result"
    RUBRIC_BASED_INSTRUCTION_FOLLOWING_RESULT = (
        "rubric_based_instruction_following_result"
    )

    AUTOMATIC_METRIC_RESULTS_LIST = (
        EXACT_MATCH_RESULTS,
        BLEU_RESULTS,
        ROUGE_RESULTS,
        TOOL_CALL_VALID_RESULTS,
        TOOL_NAME_MATCH_RESULTS,
        TOOL_PARAMETER_KEY_MATCH_RESULTS,
        TOOL_PARAMETER_KV_MATCH_RESULTS,
        TRAJECTORY_EXACT_MATCH_RESULTS,
        TRAJECTORY_IN_ORDER_MATCH_RESULTS,
        TRAJECTORY_ANY_ORDER_MATCH_RESULTS,
        TRAJECTORY_PRECISION_RESULTS,
        TRAJECTORY_RECALL_RESULTS,
        TRAJECTORY_SINGLE_TOOL_USE_RESULTS,
    )


@dataclasses.dataclass(frozen=True)
class Dataset:
    # Default evaluation dataset schema column names.
    MODEL_RESPONSE_COLUMN = "response"
    BASELINE_MODEL_RESPONSE_COLUMN = "baseline_model_response"
    PROMPT_COLUMN = "prompt"
    REFERENCE_COLUMN = "reference"
    PREDICTED_TRAJECTORY_COLUMN = "predicted_trajectory"
    REFERENCE_TRAJECTORY_COLUMN = "reference_trajectory"
    RUBRICS_COLUMN = "rubrics"
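    # Illustrative sketch only: an evaluation dataset is typically supplied as a
    # pandas DataFrame whose columns use these names, e.g.
    #     pd.DataFrame({"prompt": ["..."], "response": ["..."], "reference": ["..."]})

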
@dataclasses.dataclass(frozen=True)
class QuotaLimit:
"""Generative AI on Vertex AI quota limits."""
# Default Evaluation Service QPS limit.
EVAL_SERVICE_QPS = 10