structure saas with tools
@@ -0,0 +1,592 @@
# -*- coding: utf-8 -*-

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
from typing import Any, Callable, Dict, List, Literal, Optional, TYPE_CHECKING, Union
import uuid
import warnings

from google.api_core import exceptions
import vertexai
from google.cloud.aiplatform import base
from google.cloud.aiplatform import utils
from google.cloud.aiplatform.metadata import metadata
from vertexai import generative_models
from vertexai.evaluation import _base as eval_base
from vertexai.evaluation import _evaluation
from vertexai.evaluation import constants
from vertexai.evaluation import utils as eval_utils
from vertexai.evaluation.metrics import (
    _base as metrics_base,
)
from vertexai.evaluation.metrics import (
    pairwise_metric,
)
from vertexai.evaluation.metrics import (
    pointwise_metric,
)
import numpy as np

if TYPE_CHECKING:
    import pandas as pd

# pylint: disable=g-import-not-at-top
try:
    from IPython import display as IPython_display
except ImportError:
    IPython_display = None

_LOGGER = base.Logger(__name__)
logging.getLogger("urllib3.connectionpool").setLevel(logging.ERROR)
warnings.filterwarnings("ignore")

EvalResult = eval_base.EvalResult
GenerativeModel = generative_models.GenerativeModel


class EvalTask:
    """A class representing an EvalTask.

    An evaluation task assesses the ability of a Gen AI model, agent, or
    application to perform a specific task in response to prompts.

    Each evaluation task includes an evaluation dataset, which can be a set of
    test cases, and a set of metrics for assessment. These tasks provide the
    framework for running evaluations in a standardized and repeatable way,
    allowing for comparative assessment with varying run-specific parameters.

    Dataset Details:

        Default dataset column names:

            * prompt_column_name: "prompt"
            * reference_column_name: "reference"
            * response_column_name: "response"
            * baseline_model_response_column_name: "baseline_model_response"
            * rubrics_column_name: "rubrics"

        Requirements for different use cases:

            * Bring-your-own-response (BYOR): You already have the data that
              you want to evaluate stored in the dataset. The response column
              name can be customized by providing the `response_column_name`
              parameter, or in the `metric_column_mapping`. For BYOR pairwise
              evaluation, the baseline model response column name can be
              customized by providing the `baseline_model_response_column_name`
              parameter, or in the `metric_column_mapping`. If the `response`
              column or the `baseline_model_response` column is present while
              the corresponding model is specified, an error will be raised.

            * Perform model/agent inference without a prompt template: You have
              a dataset containing the input prompts to the model/agent and
              want to perform inference before evaluation. A column named
              `prompt` is required in the evaluation dataset and is used
              directly as input to the model/agent.

            * Perform model/agent inference with a prompt template: You have a
              dataset containing the input variables to the prompt template and
              want to assemble the prompts for inference. The evaluation
              dataset must contain column names corresponding to the variable
              names in the prompt template. For example, if the prompt template
              is "Instruction: {instruction}, context: {context}", the dataset
              must contain `instruction` and `context` columns.

    Metrics Details:

        The supported metric descriptions, rating rubrics, and required input
        variables can be found on the Vertex AI public documentation page:
        [Evaluation methods and metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval).

    Usage Examples:

        1. To perform bring-your-own-response (BYOR) evaluation, provide the
        model responses in the `response` column in the dataset. If a pairwise
        metric is used for BYOR evaluation, provide the baseline model
        responses in the `baseline_model_response` column.

        ```
        eval_dataset = pd.DataFrame({
            "prompt": [...],
            "reference": [...],
            "response": [...],
            "baseline_model_response": [...],
        })
        eval_task = EvalTask(
            dataset=eval_dataset,
            metrics=[
                "bleu",
                "rouge_l_sum",
                MetricPromptTemplateExamples.Pointwise.FLUENCY,
                MetricPromptTemplateExamples.Pairwise.SAFETY,
            ],
            experiment="my-experiment",
        )
        eval_result = eval_task.evaluate(experiment_run_name="eval-experiment-run")
        ```

        2. To perform evaluation with Gemini model inference, specify the
        `model` parameter with a `GenerativeModel` instance. The input column
        name to the model is `prompt` and must be present in the dataset.

        ```
        eval_dataset = pd.DataFrame({
            "reference": [...],
            "prompt": [...],
        })
        result = EvalTask(
            dataset=eval_dataset,
            metrics=["exact_match", "bleu", "rouge_1", "rouge_l_sum"],
            experiment="my-experiment",
        ).evaluate(
            model=GenerativeModel("gemini-1.5-pro"),
            experiment_run_name="gemini-eval-run",
        )
        ```

        3. If a `prompt_template` is specified, the `prompt` column is not
        required. Prompts can be assembled from the evaluation dataset, and all
        prompt template variable names must be present in the dataset columns.

        ```
        eval_dataset = pd.DataFrame({
            "context": [...],
            "instruction": [...],
        })
        result = EvalTask(
            dataset=eval_dataset,
            metrics=[MetricPromptTemplateExamples.Pointwise.SUMMARIZATION_QUALITY],
        ).evaluate(
            model=GenerativeModel("gemini-1.5-pro"),
            prompt_template="{instruction}. Article: {context}. Summary:",
        )
        ```

        4. To perform evaluation with custom model inference, specify the
        `model` parameter with a custom inference function. The input column
        name to the custom inference function is `prompt` and must be present
        in the dataset.

        ```
        from openai import OpenAI

        client = OpenAI()

        def custom_model_fn(input: str) -> str:
            response = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "user", "content": input}
                ]
            )
            return response.choices[0].message.content

        eval_dataset = pd.DataFrame({
            "prompt": [...],
            "reference": [...],
        })
        result = EvalTask(
            dataset=eval_dataset,
            metrics=[MetricPromptTemplateExamples.Pointwise.SAFETY],
            experiment="my-experiment",
        ).evaluate(
            model=custom_model_fn,
            experiment_run_name="gpt-eval-run",
        )
        ```

        5. To perform pairwise metric evaluation with a model inference step,
        specify the `baseline_model` input to a `PairwiseMetric` instance and
        the candidate `model` input to the `EvalTask.evaluate()` function. The
        input column name to both models is `prompt` and must be present in the
        dataset.

        ```
        baseline_model = GenerativeModel("gemini-1.0-pro")
        candidate_model = GenerativeModel("gemini-1.5-pro")

        pairwise_groundedness = PairwiseMetric(
            metric_prompt_template=MetricPromptTemplateExamples.get_prompt_template(
                "pairwise_groundedness"
            ),
            baseline_model=baseline_model,
        )
        eval_dataset = pd.DataFrame({
            "prompt": [...],
        })
        result = EvalTask(
            dataset=eval_dataset,
            metrics=[pairwise_groundedness],
            experiment="my-pairwise-experiment",
        ).evaluate(
            model=candidate_model,
            experiment_run_name="gemini-pairwise-eval-run",
        )
        ```
    """

    _resource_noun = "evaluationTasks"

    def __init__(
        self,
        *,
        dataset: Union["pd.DataFrame", str, Dict[str, Any]],
        metrics: List[
            Union[
                Literal[
                    "exact_match",
                    "bleu",
                    "rouge_1",
                    "rouge_2",
                    "rouge_l",
                    "rouge_l_sum",
                    "tool_call_valid",
                    "tool_name_match",
                    "tool_parameter_key_match",
                    "tool_parameter_kv_match",
                ],
                metrics_base.CustomMetric,
                metrics_base._AutomaticMetric,
                metrics_base._TranslationMetric,
                pointwise_metric.PointwiseMetric,
                pairwise_metric.PairwiseMetric,
            ]
        ],
        experiment: Optional[str] = None,
        metric_column_mapping: Optional[Dict[str, str]] = None,
        output_uri_prefix: Optional[str] = "",
    ):
"""Initializes an EvalTask.
|
||||
|
||||
Args:
|
||||
dataset: The dataset to be evaluated.
|
||||
Supports the following dataset formats:
|
||||
* pandas.DataFrame: Used directly for evaluation.
|
||||
* Dict: Converted to a pandas DataFrame before evaluation.
|
||||
* str: Interpreted as a file path or URI. Supported formats include:
|
||||
* Local JSONL or CSV files: Loaded from the local filesystem.
|
||||
* GCS JSONL or CSV files: Loaded from Google Cloud Storage
|
||||
(e.g., 'gs://bucket/data.csv').
|
||||
* BigQuery table URI: Loaded from Google Cloud BigQuery
|
||||
(e.g., 'bq://project-id.dataset.table_name').
|
||||
metrics: The list of metric names, or Metric instances to evaluate.
|
||||
Prompt template is required for PairwiseMetric.
|
||||
experiment: The name of the experiment to log the evaluations to.
|
||||
metric_column_mapping: An optional dictionary column mapping that
|
||||
overrides the metric prompt template input variable names with
|
||||
mapped the evaluation dataset column names, used during evaluation.
|
||||
For example, if the input_variables of the metric prompt template
|
||||
are ["context", "reference"], the metric_column_mapping can be
|
||||
{
|
||||
"context": "news_context",
|
||||
"reference": "ground_truth",
|
||||
"response": "model_1_response"
|
||||
}
|
||||
if the dataset has columns "news_context", "ground_truth" and
|
||||
"model_1_response".
|
||||
output_uri_prefix: GCS location to store the metrics_table from
|
||||
evaluation results.
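
        Example:
            A minimal, illustrative sketch of `metric_column_mapping`; the
            column names below and the `my_pointwise_metric` variable (a
            `PointwiseMetric` whose prompt template uses {context}, {reference},
            and {response}) are hypothetical:

            ```
            eval_task = EvalTask(
                dataset=pd.DataFrame({
                    "news_context": [...],
                    "ground_truth": [...],
                    "model_1_response": [...],
                }),
                metrics=[my_pointwise_metric],
                metric_column_mapping={
                    "context": "news_context",
                    "reference": "ground_truth",
                    "response": "model_1_response",
                },
            )
            ```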
        """
        self._raw_dataset = dataset
        self._dataset = eval_utils.load_dataset(dataset)
        self._metrics = metrics
        self._experiment = experiment
        self._metric_column_mapping = eval_utils.initialize_metric_column_mapping(
            metric_column_mapping, self._dataset
        )
        self.output_uri_prefix = output_uri_prefix

    @property
    def dataset(self) -> "pd.DataFrame":
        """Returns evaluation dataset."""
        return self._dataset

    @property
    def metrics(self) -> List[Union[str, metrics_base.CustomMetric]]:
        """Returns metrics."""
        return self._metrics

    @property
    def experiment(self) -> Optional[str]:
        """Returns experiment name."""
        return self._experiment

    def _evaluate_with_experiment(
        self,
        *,
        model: Optional[Union[GenerativeModel, Callable[[str], str]]] = None,
        prompt_template: Optional[str] = None,
        experiment_run_name: Optional[str] = None,
        evaluation_service_qps: Optional[float] = None,
        retry_timeout: float = 120.0,
        output_file_name: Optional[str] = None,
    ) -> EvalResult:
        """Runs an evaluation for the EvalTask with an experiment.

        Args:
            model: A GenerativeModel instance or a custom model function to
                generate responses to evaluate. If not provided, the evaluation
                is computed with the `response` column in the `dataset`.
            prompt_template: The prompt template to use for the evaluation. If
                not set, the prompt template that was used to create the
                EvalTask will be used.
            experiment_run_name: The name of the experiment run to log the
                evaluation to if an experiment is set for this EvalTask. If not
                provided, a random unique experiment run name is used.
            evaluation_service_qps: The custom QPS limit for the evaluation
                service.
            retry_timeout: How long to keep retrying the evaluation requests
                for the whole evaluation dataset, in seconds.
            output_file_name: The file name, with a ".csv" suffix, used to
                store the output metrics_table to be tracked in the experiment
                run.

        Returns:
            The evaluation result.
        """
        self._validate_experiment_run()
        with vertexai.preview.start_run(experiment_run_name):
            self._log_eval_experiment_param(
                model=model,
                prompt_template=prompt_template,
                output_file_name=output_file_name,
            )
            eval_result = _evaluation.evaluate(
                dataset=self._dataset,
                metrics=self._metrics,
                model=model,
                prompt_template=prompt_template,
                metric_column_mapping=self._metric_column_mapping,
                evaluation_service_qps=evaluation_service_qps,
                retry_timeout=retry_timeout,
            )

            eval_result.summary_metrics = {
                k: ("NaN" if isinstance(v, float) and np.isnan(v) else v)
                for k, v in eval_result.summary_metrics.items()
            }
            eval_result.metadata = {
                "experiment": self._experiment,
                "experiment_run": experiment_run_name,
            }
            try:
                vertexai.preview.log_metrics(eval_result.summary_metrics)
            except (TypeError, exceptions.InvalidArgument) as e:
                _LOGGER.warning(f"Experiment metrics logging failed: {str(e)}")
        return eval_result

    def evaluate(
        self,
        *,
        model: Optional[Union[GenerativeModel, Callable[[str], str]]] = None,
        prompt_template: Optional[str] = None,
        experiment_run_name: Optional[str] = None,
        response_column_name: Optional[str] = None,
        baseline_model_response_column_name: Optional[str] = None,
        evaluation_service_qps: Optional[float] = None,
        retry_timeout: float = 120.0,
        output_file_name: Optional[str] = None,
    ) -> EvalResult:
        """Runs an evaluation for the EvalTask.

        Args:
            model: A GenerativeModel instance or a custom model function to
                generate responses to evaluate. If not provided, the evaluation
                can be performed in bring-your-own-response (BYOR) mode.
            prompt_template: The prompt template to use for the evaluation. If
                not set, the prompt template that was used to create the
                EvalTask will be used.
            experiment_run_name: The name of the experiment run to log the
                evaluation to if an experiment is set for this EvalTask. If not
                provided, a random unique experiment run name is used.
            response_column_name: The column name of the model response in the
                dataset. If provided, this overrides the
                `metric_column_mapping` of the `EvalTask` (an illustrative
                sketch follows the Returns section below).
            baseline_model_response_column_name: The column name of the
                baseline model response in the dataset for pairwise metrics. If
                provided, this overrides the `metric_column_mapping` of the
                `EvalTask`.
            evaluation_service_qps: The custom QPS limit for the evaluation
                service.
            retry_timeout: How long to keep retrying the evaluation requests
                for the whole evaluation dataset, in seconds.
            output_file_name: The file name, with a ".csv" suffix, used to
                store the output metrics_table.

        Returns:
            The evaluation result.
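
        Example:
            A minimal, illustrative BYOR call; the pre-built `eval_task`
            instance and the "model_1_response" column name are hypothetical:

            ```
            eval_result = eval_task.evaluate(
                response_column_name="model_1_response",
                experiment_run_name="byor-eval-run",
            )
            ```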
        """
        global_experiment_name = metadata._experiment_tracker.experiment_name
        if experiment_run_name and not self._experiment and not global_experiment_name:
            raise ValueError(
                "Experiment is not set. Please initialize `EvalTask` with an"
                " experiment, or initialize a global experiment with"
                " `vertexai.init(experiment='experiment_name')` for logging"
                " this evaluation run."
            )
        if self.output_uri_prefix and not output_file_name:
            output_file_name = f"eval_results_{utils.timestamped_unique_name()}.csv"
        self._verify_and_set_response_column_name(
            response_column_name=response_column_name,
            metric_column_mapping_key=constants.Dataset.MODEL_RESPONSE_COLUMN,
        )
        self._verify_and_set_response_column_name(
            response_column_name=baseline_model_response_column_name,
            metric_column_mapping_key=constants.Dataset.BASELINE_MODEL_RESPONSE_COLUMN,
        )

        experiment_run_name = experiment_run_name or f"{uuid.uuid4()}"
        if self._experiment and global_experiment_name:
            metadata._experiment_tracker.set_experiment(
                experiment=self._experiment, backing_tensorboard=False
            )
            eval_result = self._evaluate_with_experiment(
                model=model,
                prompt_template=prompt_template,
                experiment_run_name=experiment_run_name,
                evaluation_service_qps=evaluation_service_qps,
                retry_timeout=retry_timeout,
                output_file_name=output_file_name,
            )
            metadata._experiment_tracker.set_experiment(
                experiment=global_experiment_name,
                backing_tensorboard=False,
                display_button=False,
            )
        elif self._experiment and not global_experiment_name:
            metadata._experiment_tracker.set_experiment(
                experiment=self._experiment, backing_tensorboard=False
            )
            eval_result = self._evaluate_with_experiment(
                model=model,
                prompt_template=prompt_template,
                experiment_run_name=experiment_run_name,
                evaluation_service_qps=evaluation_service_qps,
                retry_timeout=retry_timeout,
                output_file_name=output_file_name,
            )
            metadata._experiment_tracker.reset()
        elif not self._experiment and global_experiment_name:
            eval_result = self._evaluate_with_experiment(
                model=model,
                prompt_template=prompt_template,
                experiment_run_name=experiment_run_name,
                evaluation_service_qps=evaluation_service_qps,
                retry_timeout=retry_timeout,
                output_file_name=output_file_name,
            )
        else:
            eval_result = _evaluation.evaluate(
                dataset=self.dataset,
                metrics=self.metrics,
                model=model,
                prompt_template=prompt_template,
                metric_column_mapping=self._metric_column_mapping,
                evaluation_service_qps=evaluation_service_qps,
                retry_timeout=retry_timeout,
            )

        candidate_model_name = None
        if isinstance(model, generative_models.GenerativeModel):
            candidate_model_name = model._model_name

        baseline_model_name = None
        pairwise_metrics = [
            metric
            for metric in self.metrics
            if isinstance(metric, pairwise_metric.PairwiseMetric)
        ]
        if pairwise_metrics:
            # All pairwise metrics should have the same baseline model.
            baseline_model = pairwise_metrics[0].baseline_model
            if isinstance(baseline_model, generative_models.GenerativeModel):
                baseline_model_name = baseline_model._model_name

        dataset_uri = None
        if isinstance(self._raw_dataset, str):
            dataset_uri = self._raw_dataset

        eval_utils.upload_evaluation_results(
            eval_result,
            self.output_uri_prefix,
            output_file_name,
            candidate_model_name,
            baseline_model_name,
            dataset_uri,
            self.metrics,
        )
        return eval_result

    def _validate_experiment_run(self) -> None:
        """Checks if an experiment run already exists."""
        if metadata._experiment_tracker.experiment_run:
            raise ValueError(
                "Experiment run already exists. Please specify the name of an"
                " experiment run to assign the current session to within this"
                " evaluation."
            )

    def _log_eval_experiment_param(
        self,
        model: Optional[Union[GenerativeModel, Callable[[str], str]]] = None,
        prompt_template: Optional[str] = None,
        output_file_name: Optional[str] = None,
    ) -> None:
        """Logs variable input parameters of an evaluation to an experiment run."""
        eval_metadata = {}

        if prompt_template is not None:
            eval_metadata.update({"prompt_template": prompt_template})

        if isinstance(model, GenerativeModel):
            eval_metadata.update(
                {
                    "model_name": model._model_name,
                }
            )

            if model._generation_config and isinstance(
                model._generation_config, dict
            ):
                eval_metadata.update(**model._generation_config)

            if model._safety_settings and isinstance(model._safety_settings, dict):
                safety_settings = model._safety_settings
                safety_settings_as_str = {
                    category.name: threshold.name
                    for category, threshold in safety_settings.items()
                }
                eval_metadata.update(safety_settings_as_str)

        if self.output_uri_prefix and output_file_name:
            eval_metadata.update(
                {"output_file": self.output_uri_prefix + "/" + output_file_name}
            )

        if eval_metadata:
            _LOGGER.info(f"Logging Eval Experiment metadata: {eval_metadata}")
            try:
                vertexai.preview.log_params(eval_metadata)
            except (ValueError, TypeError) as e:
                _LOGGER.warning(f"Experiment metadata logging failed: {str(e)}")

    def _verify_and_set_response_column_name(
        self, response_column_name: Optional[str], metric_column_mapping_key: str
    ) -> None:
        """Verifies and sets the model response column names."""
        if response_column_name:
            if response_column_name in self._dataset.columns:
                self._metric_column_mapping[
                    metric_column_mapping_key
                ] = response_column_name
            else:
                raise ValueError(
                    f"(Baseline) Model response column {response_column_name} is not"
                    " found in the dataset."
                )

    def display_runs(self):
        """Displays experiment runs associated with this EvalTask."""
        if not self._experiment:
            raise ValueError("Experiment is not set.")
        elif IPython_display:
            IPython_display.display(
                vertexai.preview.get_experiment_df(self._experiment)
            )