structure saas with tools

Davidson Gomes
2025-04-25 15:30:54 -03:00
commit 1aef473937
16434 changed files with 6584257 additions and 0 deletions


@@ -0,0 +1,182 @@
# -*- coding: utf-8 -*-
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""The vertexai resources module."""
from google.cloud.aiplatform import initializer
from google.cloud.aiplatform.datasets import (
ImageDataset,
TabularDataset,
TextDataset,
TimeSeriesDataset,
VideoDataset,
)
from google.cloud.aiplatform import explain
from google.cloud.aiplatform import gapic
from google.cloud.aiplatform import hyperparameter_tuning
from google.cloud.aiplatform.featurestore import (
EntityType,
Feature,
Featurestore,
)
from google.cloud.aiplatform.matching_engine import (
MatchingEngineIndex,
MatchingEngineIndexEndpoint,
)
from google.cloud.aiplatform import metadata
from google.cloud.aiplatform.tensorboard import uploader_tracker
from google.cloud.aiplatform.models import DeploymentResourcePool
from google.cloud.aiplatform.models import Endpoint
from google.cloud.aiplatform.models import PrivateEndpoint
from google.cloud.aiplatform.models import Model
from google.cloud.aiplatform.models import ModelRegistry
from google.cloud.aiplatform.model_evaluation import ModelEvaluation
from google.cloud.aiplatform.jobs import (
BatchPredictionJob,
CustomJob,
HyperparameterTuningJob,
ModelDeploymentMonitoringJob,
)
from google.cloud.aiplatform.pipeline_jobs import PipelineJob
from google.cloud.aiplatform.pipeline_job_schedules import (
PipelineJobSchedule,
)
from google.cloud.aiplatform.tensorboard import (
Tensorboard,
TensorboardExperiment,
TensorboardRun,
TensorboardTimeSeries,
)
from google.cloud.aiplatform.training_jobs import (
CustomTrainingJob,
CustomContainerTrainingJob,
CustomPythonPackageTrainingJob,
AutoMLTabularTrainingJob,
AutoMLForecastingTrainingJob,
SequenceToSequencePlusForecastingTrainingJob,
TemporalFusionTransformerForecastingTrainingJob,
TimeSeriesDenseEncoderForecastingTrainingJob,
AutoMLImageTrainingJob,
AutoMLTextTrainingJob,
AutoMLVideoTrainingJob,
)
from google.cloud.aiplatform import helpers
"""
Usage:
import vertexai
vertexai.init(project='my_project')
"""
init = initializer.global_config.init
get_pipeline_df = metadata.metadata._LegacyExperimentService.get_pipeline_df
log_params = metadata.metadata._experiment_tracker.log_params
log_metrics = metadata.metadata._experiment_tracker.log_metrics
log_classification_metrics = (
metadata.metadata._experiment_tracker.log_classification_metrics
)
log_model = metadata.metadata._experiment_tracker.log_model
get_experiment_df = metadata.metadata._experiment_tracker.get_experiment_df
start_run = metadata.metadata._experiment_tracker.start_run
autolog = metadata.metadata._experiment_tracker.autolog
start_execution = metadata.metadata._experiment_tracker.start_execution
log = metadata.metadata._experiment_tracker.log
log_time_series_metrics = metadata.metadata._experiment_tracker.log_time_series_metrics
end_run = metadata.metadata._experiment_tracker.end_run
upload_tb_log = uploader_tracker._tensorboard_tracker.upload_tb_log
start_upload_tb_log = uploader_tracker._tensorboard_tracker.start_upload_tb_log
end_upload_tb_log = uploader_tracker._tensorboard_tracker.end_upload_tb_log
save_model = metadata._models.save_model
get_experiment_model = metadata.schema.google.artifact_schema.ExperimentModel.get
Experiment = metadata.experiment_resources.Experiment
ExperimentRun = metadata.experiment_run_resource.ExperimentRun
Artifact = metadata.artifact.Artifact
Execution = metadata.execution.Execution
Context = metadata.context.Context
__all__ = (
"end_run",
"explain",
"gapic",
"init",
"helpers",
"hyperparameter_tuning",
"log",
"log_params",
"log_metrics",
"log_classification_metrics",
"log_model",
"log_time_series_metrics",
"get_experiment_df",
"get_pipeline_df",
"start_run",
"start_execution",
"save_model",
"get_experiment_model",
"autolog",
"upload_tb_log",
"start_upload_tb_log",
"end_upload_tb_log",
"Artifact",
"AutoMLImageTrainingJob",
"AutoMLTabularTrainingJob",
"AutoMLForecastingTrainingJob",
"AutoMLTextTrainingJob",
"AutoMLVideoTrainingJob",
"BatchPredictionJob",
"CustomJob",
"CustomTrainingJob",
"CustomContainerTrainingJob",
"CustomPythonPackageTrainingJob",
"DeploymentResourcePool",
"Endpoint",
"EntityType",
"Execution",
"Experiment",
"ExperimentRun",
"Feature",
"Featurestore",
"MatchingEngineIndex",
"MatchingEngineIndexEndpoint",
"ImageDataset",
"HyperparameterTuningJob",
"Model",
"ModelRegistry",
"ModelEvaluation",
"ModelDeploymentMonitoringJob",
"PipelineJob",
"PipelineJobSchedule",
"PrivateEndpoint",
"SequenceToSequencePlusForecastingTrainingJob",
"TabularDataset",
"Tensorboard",
"TensorboardExperiment",
"TensorboardRun",
"TensorboardTimeSeries",
"TextDataset",
"TemporalFusionTransformerForecastingTrainingJob",
"TimeSeriesDataset",
"TimeSeriesDenseEncoderForecastingTrainingJob",
"VideoDataset",
)
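
For context, a minimal usage sketch of the aliases re-exported above, assuming they are surfaced at the top-level `vertexai` package as the usage note in this file suggests; the project, experiment, and metric names are illustrative:

```python
import vertexai

# One-time SDK configuration; resource and experiment calls below inherit it.
vertexai.init(project="my-project", location="us-central1", experiment="my-experiment")

# Experiment tracking helpers aliased above from the metadata experiment tracker.
vertexai.start_run("run-1")
vertexai.log_params({"learning_rate": 0.01, "epochs": 5})
vertexai.log_metrics({"accuracy": 0.95})
vertexai.end_run()
```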


@@ -0,0 +1,91 @@
# -*- coding: utf-8 -*-
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""The vertexai resources preview module."""
from google.cloud.aiplatform.preview.jobs import (
CustomJob,
HyperparameterTuningJob,
)
from google.cloud.aiplatform.preview.models import (
Prediction,
DeploymentResourcePool,
Endpoint,
Model,
)
from google.cloud.aiplatform.preview.featurestore.entity_type import (
EntityType,
)
from google.cloud.aiplatform.preview.persistent_resource import (
PersistentResource,
)
from google.cloud.aiplatform.preview.pipelinejobschedule.pipeline_job_schedules import (
PipelineJobSchedule,
)
from vertexai.resources.preview.feature_store import (
Feature,
FeatureGroup,
FeatureGroupBigQuerySource,
FeatureMonitor,
FeatureOnlineStore,
FeatureOnlineStoreType,
FeatureView,
FeatureViewBigQuerySource,
FeatureViewReadResponse,
FeatureViewRegistrySource,
FeatureViewVertexRagSource,
IndexConfig,
TreeAhConfig,
BruteForceConfig,
DistanceMeasureType,
AlgorithmConfig,
)
from vertexai.resources.preview.ml_monitoring import (
ModelMonitor,
ModelMonitoringJob,
)
__all__ = (
"CustomJob",
"HyperparameterTuningJob",
"Prediction",
"DeploymentResourcePool",
"Endpoint",
"Model",
"PersistentResource",
"EntityType",
"PipelineJobSchedule",
"Feature",
"FeatureGroup",
"FeatureGroupBigQuerySource",
"FeatureMonitor",
"FeatureOnlineStoreType",
"FeatureOnlineStore",
"FeatureView",
"FeatureViewBigQuerySource",
"FeatureViewReadResponse",
"FeatureViewVertexRagSource",
"FeatureViewRegistrySource",
"IndexConfig",
"TreeAhConfig",
"BruteForceConfig",
"DistanceMeasureType",
"AlgorithmConfig",
"ModelMonitor",
"ModelMonitoringJob",
)
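
A hedged sketch of how the preview Feature Registry surface exported here might be used; the module path is taken from the imports above, and all resource names and URIs are illustrative:

```python
from vertexai.resources.preview import FeatureGroup, FeatureGroupBigQuerySource

# Register a feature group backed by a BigQuery table (illustrative URI).
fg = FeatureGroup.create(
    name="my_feature_group",
    source=FeatureGroupBigQuerySource(
        uri="bq://my-project.my_dataset.my_table",
        entity_id_columns=["entity_id"],
    ),
)

# Declare a feature on top of one of the table's columns.
fg.create_feature("total_spend", description="Total spend per user")
```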


@@ -0,0 +1,71 @@
# -*- coding: utf-8 -*-
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""The vertexai resources preview module."""
from vertexai.resources.preview.feature_store.feature import (
Feature,
)
from vertexai.resources.preview.feature_store.feature_group import (
FeatureGroup,
)
from vertexai.resources.preview.feature_store.feature_monitor import (
FeatureMonitor,
)
from vertexai.resources.preview.feature_store.feature_online_store import (
FeatureOnlineStore,
FeatureOnlineStoreType,
)
from vertexai.resources.preview.feature_store.feature_view import (
FeatureView,
)
from vertexai.resources.preview.feature_store.utils import (
FeatureGroupBigQuerySource,
FeatureViewBigQuerySource,
FeatureViewReadResponse,
FeatureViewVertexRagSource,
FeatureViewRegistrySource,
IndexConfig,
TreeAhConfig,
BruteForceConfig,
DistanceMeasureType,
AlgorithmConfig,
)
__all__ = (
"Feature",
"FeatureGroup",
"FeatureGroupBigQuerySource",
"FeatureMonitor",
"FeatureOnlineStoreType",
"FeatureOnlineStore",
"FeatureView",
"FeatureViewBigQuerySource",
"FeatureViewReadResponse",
"FeatureViewVertexRagSource",
"FeatureViewRegistrySource",
"IndexConfig",
"TreeAhConfig",
"BruteForceConfig",
"DistanceMeasureType",
"AlgorithmConfig",
)
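
For reference, a hedged sketch of assembling an embedding index configuration from the classes listed above; the field names follow the `IndexConfig` example in the `FeatureOnlineStore.create_feature_view` docstring later in this commit and may differ in other SDK versions:

```python
from vertexai.resources.preview.feature_store import (
    DistanceMeasureType,
    IndexConfig,
    TreeAhConfig,
)

# Index over an "embedding" column using a tree-AH ANN index (illustrative values).
index_config = IndexConfig(
    embedding_column="embedding",
    crowding_column="crowding",
    dimensions=1536,
    distance_measure_type=DistanceMeasureType.SQUARED_L2_DISTANCE,
    algorithm_config=TreeAhConfig(),
)
```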


@@ -0,0 +1,190 @@
# -*- coding: utf-8 -*-
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import textwrap
from dataclasses import dataclass
from typing import Optional, List
@dataclass
class DataSource:
"""An object to represent a data source - both entity DataFrame and any feature data.
Contains helpers for use with SQL templating.
"""
def __init__(
self,
qualifying_name: str,
sql: str,
data_columns: List[str],
timestamp_column: str,
entity_id_columns: Optional[List[str]] = None,
):
"""Initialize DataSource object.
Args:
qualifying_name:
A unique name used to qualify the data in the PITL query.
sql:
SQL query representing the data_source.
data_columns:
Columns other than entity ID column(s) and timestamp column.
timestamp_column:
The column that holds feature timestamp data.
entity_id_columns:
The column(s) that holds entity IDs. Shouldn't be populated for
entity_df.
"""
self.qualifying_name = qualifying_name
self._sql = sql
self.data_columns = data_columns
self.timestamp_column = timestamp_column
self.entity_id_columns = entity_id_columns
def copy_with_pitl_suffix(self) -> "DataSource":
import copy
data_source = copy.copy(self)
data_source.qualifying_name += "_pitl"
return data_source
@property
def sql(self):
return self._sql
@property
def comma_separated_qualified_data_columns(self):
return ", ".join(
[self.qualifying_name + "." + col for col in self.data_columns]
)
@property
def comma_separated_name_qualified_all_non_timestamp_columns(self):
"""Same as `comma_separated_qualified_data_columns` but including entity ID column."""
all_columns = self.data_columns.copy()
if self.entity_id_columns:
all_columns += self.entity_id_columns
return ", ".join([self.qualifying_name + "." + col for col in all_columns])
@property
def qualified_timestamp_column(self) -> str:
"""Returns name qualified timestamp column e.g. `name.feature_timestamp`."""
return f"{self.qualifying_name}.{self.timestamp_column}"
def _generate_eid_check(entity_data: DataSource, feature: DataSource):
"""Generate equality check for entity columns of feature against matching columns in entity_data."""
e_cols = set(entity_data.data_columns)
f_cols = feature.entity_id_columns
assert f_cols
equal_statements = []
for col in f_cols:
if col not in e_cols:
raise ValueError(
f"Feature entity ID column '{col}' should be a column in the entity DataFrame."
)
equal_statements.append(
f"{entity_data.qualifying_name}.{col} = {feature.qualifying_name}.{col}"
)
statement = " AND\n".join(equal_statements)
return statement
# Args:
# textwrap: Module
# generate_eid_check: function (above)
# entity_data: DataSource
# feature_data: List[DataSource]
_PITL_QUERY_TEMPLATE_RAW = """WITH
{{ entity_data.qualifying_name }}_without_row_num AS (
{{ textwrap.indent(entity_data.sql, ' ' * 4) }}
),
{{ entity_data.qualifying_name }} AS (
SELECT *, ROW_NUMBER() OVER() AS row_num,
FROM entity_df_without_row_num
),
# Features
{% for feature_data_elem in feature_data %}
{{ feature_data_elem.qualifying_name }} AS (
{{ textwrap.indent(feature_data_elem.sql, ' ' * 4) }}
),
{% endfor %}
# Features with PITL
{% for feature_data_elem in feature_data %}
{{ feature_data_elem.qualifying_name }}_pitl AS (
SELECT
{{ entity_data.qualifying_name }}.row_num,
{{ feature_data_elem.comma_separated_qualified_data_columns }},
FROM {{ entity_data.qualifying_name }}
LEFT JOIN {{ feature_data_elem.qualifying_name }}
ON (
{{ textwrap.indent(generate_eid_check(entity_data, feature_data_elem) + ' AND', ' ' * 6) }}
CAST({{ feature_data_elem.qualified_timestamp_column }} AS TIMESTAMP) <= CAST({{ entity_data.qualified_timestamp_column }} AS TIMESTAMP)
)
QUALIFY ROW_NUMBER() OVER (PARTITION BY {{ entity_data.qualifying_name }}.row_num ORDER BY {{ feature_data_elem.qualified_timestamp_column }} DESC) = 1
){{ ',' if not loop.last else '' }}
{% endfor %}
SELECT
{{ entity_data.comma_separated_name_qualified_all_non_timestamp_columns }},
{% for feature_data_elem in feature_data %}
{% set feature_pitl = feature_data_elem.copy_with_pitl_suffix() %}
{{ feature_pitl.comma_separated_qualified_data_columns }},
{% endfor %}
{{ entity_data.qualified_timestamp_column }}
FROM {{ entity_data.qualifying_name }}
{% for feature_data_elem in feature_data %}
JOIN {{ feature_data_elem.qualifying_name }}_pitl USING (row_num)
{% endfor %}
"""
def pitl_query_template():
try:
import jinja2
except ImportError as exc:
raise ImportError(
"`Jinja2` is not installed but required for this functionality."
) from exc
return jinja2.Environment(
loader=jinja2.BaseLoader, lstrip_blocks=True, trim_blocks=True
).from_string(_PITL_QUERY_TEMPLATE_RAW)
def render_pitl_query(entity_data: DataSource, feature_data: List[DataSource]):
"""Return the PITL query jinja template.
The args for the query are as follows:
textwrap: The python textwrap module.
entity_data[DataSource]: The entity data(frame) as SQL source.
feature_data[List[DataSource]]:
"""
return pitl_query_template().render(
textwrap=textwrap,
generate_eid_check=_generate_eid_check,
entity_data=entity_data,
feature_data=feature_data,
)
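
A hedged sketch of how the pieces above fit together: build a `DataSource` for the entity DataFrame and one per feature table, then render the point-in-time-lookup (PITL) SQL. Requires Jinja2; all table and column names are illustrative.

```python
# Entity DataFrame source: the entities and the timestamps to look features up at.
# Note: the template above selects FROM entity_df_without_row_num, so the
# qualifying name "entity_df" is used here.
entity_df = DataSource(
    qualifying_name="entity_df",
    sql="SELECT user_id, feature_timestamp FROM `my-proj.my_ds.entities`",
    data_columns=["user_id"],
    timestamp_column="feature_timestamp",
)

# Feature source: must name its entity ID columns so the join condition can be built.
purchases = DataSource(
    qualifying_name="purchases",
    sql="SELECT user_id, total_spend, feature_timestamp FROM `my-proj.my_ds.purchases`",
    data_columns=["total_spend"],
    timestamp_column="feature_timestamp",
    entity_id_columns=["user_id"],
)

print(render_pitl_query(entity_data=entity_df, feature_data=[purchases]))
```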


@@ -0,0 +1,151 @@
# -*- coding: utf-8 -*-
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import re
from typing import List, Optional
from google.auth import credentials as auth_credentials
from google.cloud.aiplatform import base
from google.cloud.aiplatform import utils
from google.cloud.aiplatform.compat.types import (
feature as gca_feature,
feature_monitor_v1beta1 as gca_feature_monitor,
feature_v1beta1 as gca_feature_v1beta1,
featurestore_service_v1beta1 as gca_featurestore_service_v1beta1,
)
class Feature(base.VertexAiResourceNounWithFutureManager):
"""Class for managing Feature resources."""
client_class = utils.FeatureRegistryClientWithOverride
_resource_noun = "features"
_getter_method = "get_feature"
_list_method = "list_features"
_delete_method = "delete_feature"
_parse_resource_name_method = "parse_feature_path"
_format_resource_name_method = "feature_path"
_gca_resource: gca_feature.Feature
def __init__(
self,
name: str,
feature_group_id: Optional[str] = None,
project: Optional[str] = None,
location: Optional[str] = None,
latest_stats_count: Optional[int] = None,
credentials: Optional[auth_credentials.Credentials] = None,
):
"""Retrieves an existing managed feature.
Args:
name:
The resource name
(`projects/.../locations/.../featureGroups/.../features/...`) or
ID.
feature_group_id:
The feature group ID. Must be passed in if name is an ID and not
a resource path.
project:
Project to retrieve feature from. If not set, the project set in
aiplatform.init will be used.
location:
Location to retrieve feature from. If not set, the location set
in aiplatform.init will be used.
latest_stats_count:
The number of latest feature monitoring stats to retrieve. If set,
the feature is fetched via the v1beta1 API together with its latest stats.
credentials:
Custom credentials to use to retrieve this feature. Overrides
credentials set in aiplatform.init.
"""
super().__init__(
project=project,
location=location,
credentials=credentials,
resource_name=name,
)
if re.fullmatch(
r"projects/.+/locations/.+/featureGroups/.+/features/.+",
name,
):
if feature_group_id:
raise ValueError(
f"Since feature '{name}' is provided as a path, feature_group_id should not be specified."
)
feature = name
else:
from .feature_group import FeatureGroup
# Construct the feature path using feature group ID if only the
# feature group ID is provided.
if not feature_group_id:
raise ValueError(
f"Since feature '{name}' is not provided as a path, please specify feature_group_id."
)
feature_group_path = utils.full_resource_name(
resource_name=feature_group_id,
resource_noun=FeatureGroup._resource_noun,
parse_resource_name_method=FeatureGroup._parse_resource_name,
format_resource_name_method=FeatureGroup._format_resource_name,
)
feature = f"{feature_group_path}/features/{name}"
if latest_stats_count is not None:
api_client = self.__class__._instantiate_client(
location=location, credentials=credentials
)
feature_obj: gca_feature_v1beta1.Feature = api_client.select_version(
"v1beta1"
).get_feature(
request=gca_featurestore_service_v1beta1.GetFeatureRequest(
name=f"{feature}",
feature_stats_and_anomaly_spec=gca_feature_monitor.FeatureStatsAndAnomalySpec(
latest_stats_count=latest_stats_count
),
)
)
self._gca_resource = feature_obj
else:
self._gca_resource = self._get_gca_resource(resource_name=feature)
@property
def version_column_name(self) -> str:
"""The name of the BigQuery Table/View column hosting data for this version."""
return self._gca_resource.version_column_name
@property
def description(self) -> str:
"""The description of the feature."""
return self._gca_resource.description
@property
def point_of_contact(self) -> str:
"""The point of contact for the feature."""
return self._gca_resource.point_of_contact
@property
def feature_stats_and_anomalies(
self,
) -> List[gca_feature_monitor.FeatureStatsAndAnomaly]:
"""The number of latest stats to return. Only present when gca_feature is set."""
return self._gca_resource.feature_stats_and_anomaly
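
A hedged sketch of the two retrieval paths this constructor supports (by ID plus `feature_group_id`, or by full resource name); all identifiers are illustrative:

```python
# By short ID, scoped to a feature group.
f1 = Feature("total_spend", feature_group_id="my_feature_group")

# By fully qualified resource name; requests the 3 latest monitoring stats via v1beta1.
f2 = Feature(
    "projects/my-proj/locations/us-central1/featureGroups/my_feature_group/features/total_spend",
    latest_stats_count=3,
)

print(f1.description, f1.version_column_name)
print(f2.feature_stats_and_anomalies)
```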


@@ -0,0 +1,592 @@
# -*- coding: utf-8 -*-
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Dict, List, Optional, Sequence, Tuple
from google.auth import credentials as auth_credentials
from google.cloud.aiplatform import base, initializer
from google.cloud.aiplatform import utils
from google.cloud.aiplatform.compat.types import (
feature as gca_feature,
feature_group as gca_feature_group,
io as gca_io,
feature_monitor_v1beta1 as gca_feature_monitor,
)
from vertexai.resources.preview.feature_store.utils import (
FeatureGroupBigQuerySource,
)
from vertexai.resources.preview.feature_store import (
Feature,
)
from vertexai.resources.preview.feature_store.feature_monitor import (
FeatureMonitor,
)
_LOGGER = base.Logger(__name__)
class FeatureGroup(base.VertexAiResourceNounWithFutureManager):
"""Class for managing Feature Group resources."""
client_class = utils.FeatureRegistryClientWithOverride
_resource_noun = "feature_groups"
_getter_method = "get_feature_group"
_list_method = "list_feature_groups"
_delete_method = "delete_feature_group"
_parse_resource_name_method = "parse_feature_group_path"
_format_resource_name_method = "feature_group_path"
_gca_resource: gca_feature_group.FeatureGroup
def __init__(
self,
name: str,
project: Optional[str] = None,
location: Optional[str] = None,
credentials: Optional[auth_credentials.Credentials] = None,
):
"""Retrieves an existing managed feature group.
Args:
name:
The resource name
(`projects/.../locations/.../featureGroups/...`) or ID.
project:
Project to retrieve feature group from. If unset, the
project set in aiplatform.init will be used.
location:
Location to retrieve feature group from. If not set,
location set in aiplatform.init will be used.
credentials:
Custom credentials to use to retrieve this feature group.
Overrides credentials set in aiplatform.init.
"""
super().__init__(
project=project,
location=location,
credentials=credentials,
resource_name=name,
)
self._gca_resource = self._get_gca_resource(resource_name=name)
@classmethod
def create(
cls,
name: str,
source: Optional[FeatureGroupBigQuerySource] = None,
labels: Optional[Dict[str, str]] = None,
description: Optional[str] = None,
project: Optional[str] = None,
location: Optional[str] = None,
credentials: Optional[auth_credentials.Credentials] = None,
request_metadata: Optional[Sequence[Tuple[str, str]]] = None,
create_request_timeout: Optional[float] = None,
sync: bool = True,
) -> "FeatureGroup":
"""Creates a new feature group.
Args:
name: The name of the feature group.
source: The BigQuery source of the feature group.
labels:
The labels with user-defined metadata to organize your
FeatureGroup.
Label keys and values can be no longer than 64
characters (Unicode codepoints), can only
contain lowercase letters, numeric characters,
underscores and dashes. International characters
are allowed.
See https://goo.gl/xmQnxf for more information
on and examples of labels. No more than 64 user
labels can be associated with one
FeatureGroup (System labels are excluded).
System reserved label keys are prefixed with
"aiplatform.googleapis.com/" and are immutable.
description: Description of the FeatureGroup.
project:
Project to create feature group in. If unset, the project set in
aiplatform.init will be used.
location:
Location to create feature group in. If not set, location set in
aiplatform.init will be used.
credentials:
Custom credentials to use to create this feature group.
Overrides credentials set in aiplatform.init.
request_metadata:
Strings which should be sent along with the request as metadata.
create_request_timeout:
The timeout for the create request in seconds.
sync:
Whether to execute this creation synchronously. If False, this
method will be executed in concurrent Future and any downstream
object will be immediately returned and synced when the Future
has completed.
Returns:
FeatureGroup - the FeatureGroup resource object.
"""
if not source:
raise ValueError("Please specify a valid source.")
# Only BigQuery source is supported right now.
if not isinstance(source, FeatureGroupBigQuerySource):
raise ValueError("Only FeatureGroupBigQuerySource is a supported source.")
# BigQuery source validation.
if not source.uri:
raise ValueError("Please specify URI in BigQuery source.")
if not source.entity_id_columns:
_LOGGER.info(
"No entity ID columns specified in BigQuery source. Defaulting to ['entity_id']."
)
entity_id_columns = ["entity_id"]
else:
entity_id_columns = source.entity_id_columns
gapic_feature_group = gca_feature_group.FeatureGroup(
big_query=gca_feature_group.FeatureGroup.BigQuery(
big_query_source=gca_io.BigQuerySource(input_uri=source.uri),
entity_id_columns=entity_id_columns,
),
name=name,
description=description,
)
if labels:
utils.validate_labels(labels)
gapic_feature_group.labels = labels
if request_metadata is None:
request_metadata = ()
api_client = cls._instantiate_client(location=location, credentials=credentials)
create_feature_group_lro = api_client.create_feature_group(
parent=initializer.global_config.common_location_path(
project=project, location=location
),
feature_group=gapic_feature_group,
feature_group_id=name,
metadata=request_metadata,
timeout=create_request_timeout,
)
_LOGGER.log_create_with_lro(cls, create_feature_group_lro)
created_feature_group = create_feature_group_lro.result()
_LOGGER.log_create_complete(cls, created_feature_group, "feature_group")
feature_group_obj = cls(
name=created_feature_group.name,
project=project,
location=location,
credentials=credentials,
)
return feature_group_obj
@base.optional_sync()
def delete(self, force: bool = False, sync: bool = True) -> None:
"""Deletes this feature group.
WARNING: This deletion is permanent.
Args:
force:
If set to True, any Features under this FeatureGroup will also be
deleted prior to FeatureGroup deletion. Otherwise, deletion will
only succeed if the FeatureGroup has no Features.
sync:
Whether to execute this deletion synchronously. If False, this
method will be executed in concurrent Future and any downstream
object will be immediately returned and synced when the Future
has completed.
"""
lro = getattr(self.api_client, self._delete_method)(
name=self.resource_name,
force=force,
)
_LOGGER.log_delete_with_lro(self, lro)
lro.result()
_LOGGER.log_delete_complete(self)
def get_feature(
self,
feature_id: str,
latest_stats_count: Optional[int] = None,
credentials: Optional[auth_credentials.Credentials] = None,
) -> Feature:
"""Retrieves an existing managed feature.
Args:
feature_id: The ID of the feature.
latest_stats_count:
The number of latest stats to retrieve. Only returns stats if
Feature Monitor is created, and historical stats were generated.
credentials:
Custom credentials to use to retrieve the feature under this
feature group. The order of which credentials are used is as
follows: (1) this parameter (2) credentials passed to FeatureGroup
constructor (3) credentials set in aiplatform.init.
Returns:
Feature - the Feature resource object under this feature group.
"""
credentials = (
credentials or self.credentials or initializer.global_config.credentials
)
if latest_stats_count is not None:
return Feature(
name=f"{self.resource_name}/features/{feature_id}",
latest_stats_count=latest_stats_count,
credentials=credentials,
)
return Feature(
f"{self.resource_name}/features/{feature_id}", credentials=credentials
)
def create_feature(
self,
name: str,
version_column_name: Optional[str] = None,
description: Optional[str] = None,
labels: Optional[Dict[str, str]] = None,
point_of_contact: Optional[str] = None,
project: Optional[str] = None,
location: Optional[str] = None,
credentials: Optional[auth_credentials.Credentials] = None,
request_metadata: Optional[Sequence[Tuple[str, str]]] = None,
create_request_timeout: Optional[float] = None,
sync: bool = True,
) -> Feature:
"""Creates a new feature.
Args:
name: The name of the feature.
version_column_name:
The name of the BigQuery Table/View column hosting data for this
version. If no value is provided, will use feature_id.
description: Description of the feature.
labels:
The labels with user-defined metadata to organize your Features.
Label keys and values can be no longer than 64 characters
(Unicode codepoints), can only contain lowercase letters,
numeric characters, underscores and dashes. International
characters are allowed.
See https://goo.gl/xmQnxf for more information on and examples
of labels. No more than 64 user labels can be associated with
one Feature (System labels are excluded). System reserved label
keys are prefixed with "aiplatform.googleapis.com/" and are
immutable.
point_of_contact:
Entity responsible for maintaining this feature. Can be comma
separated list of email addresses or URIs.
project:
Project to create feature in. If unset, the project set in
aiplatform.init will be used.
location:
Location to create feature in. If not set, location set in
aiplatform.init will be used.
credentials:
Custom credentials to use to create this feature. Overrides
credentials set in aiplatform.init.
request_metadata:
Strings which should be sent along with the request as metadata.
create_request_timeout:
The timeout for the create request in seconds.
sync:
Whether to execute this creation synchronously. If False, this
method will be executed in concurrent Future and any downstream
object will be immediately returned and synced when the Future
has completed.
Returns:
Feature - the Feature resource object.
"""
gapic_feature = gca_feature.Feature()
if version_column_name:
gapic_feature.version_column_name = version_column_name
if description:
gapic_feature.description = description
if labels:
utils.validate_labels(labels)
gapic_feature.labels = labels
if point_of_contact:
gapic_feature.point_of_contact = point_of_contact
if request_metadata is None:
request_metadata = ()
api_client = self.__class__._instantiate_client(
location=location, credentials=credentials
)
create_feature_lro = api_client.create_feature(
parent=self.resource_name,
feature=gapic_feature,
feature_id=name,
metadata=request_metadata,
timeout=create_request_timeout,
)
_LOGGER.log_create_with_lro(Feature, create_feature_lro)
created_feature = create_feature_lro.result()
_LOGGER.log_create_complete(Feature, created_feature, "feature")
feature_obj = Feature(
name=created_feature.name,
project=project,
location=location,
credentials=credentials,
)
return feature_obj
def list_features(
self,
project: Optional[str] = None,
location: Optional[str] = None,
credentials: Optional[auth_credentials.Credentials] = None,
) -> List[Feature]:
"""Lists features under this feature group.
Args:
project:
Project to list features in. If unset, the project set in
aiplatform.init will be used.
location:
Location to list features in. If not set, location set in
aiplatform.init will be used.
credentials:
Custom credentials to use to list features. Overrides
credentials set in aiplatform.init.
Returns:
List of features under this feature group.
"""
return Feature.list(
parent=self.resource_name,
project=project,
location=location,
credentials=credentials,
)
def get_feature_monitor(
self,
feature_monitor_id: str,
credentials: Optional[auth_credentials.Credentials] = None,
) -> FeatureMonitor:
"""Retrieves an existing feature monitor.
Args:
feature_monitor_id: The ID of the feature monitor.
credentials:
Custom credentials to use to retrieve the feature monitor under this
feature group. The order of which credentials are used is as
follows: (1) this parameter (2) credentials passed to FeatureGroup
constructor (3) credentials set in aiplatform.init.
Returns:
FeatureMonitor - the Feature Monitor resource object under this
feature group.
"""
credentials = (
credentials or self.credentials or initializer.global_config.credentials
)
return FeatureMonitor(
f"{self.resource_name}/featureMonitors/{feature_monitor_id}",
credentials=credentials,
)
def create_feature_monitor(
self,
name: str,
description: Optional[str] = None,
labels: Optional[Dict[str, str]] = None,
schedule_config: Optional[str] = None,
feature_selection_configs: Optional[List[Tuple[str, float]]] = None,
project: Optional[str] = None,
location: Optional[str] = None,
credentials: Optional[auth_credentials.Credentials] = None,
request_metadata: Optional[Sequence[Tuple[str, str]]] = None,
create_request_timeout: Optional[float] = None,
) -> FeatureMonitor:
"""Creates a new feature monitor.
Args:
name: The name of the feature monitor.
description: Description of the feature monitor.
labels:
The labels with user-defined metadata to organize your FeatureMonitors.
Label keys and values can be no longer than 64 characters
(Unicode codepoints), can only contain lowercase letters,
numeric characters, underscores and dashes. International
characters are allowed.
See https://goo.gl/xmQnxf for more information on and examples
of labels. No more than 64 user labels can be associated with
one FeatureMonitor (System labels are excluded). System reserved label
keys are prefixed with "aiplatform.googleapis.com/" and are
immutable.
schedule_config:
Configures when data is to be monitored for this
FeatureMonitor. At the end of the scheduled time,
the stats and drift are generated for the selected features.
Example format: "TZ=America/New_York 0 9 * * *" (monitors
daily at 9 AM EST).
feature_selection_configs:
Required. List of tuples of feature ID and monitoring threshold.
If a threshold is not set for a feature, the default threshold of
0.3 will be used.
project:
Project to create the feature monitor in. If unset, the project set
in aiplatform.init will be used.
location:
Location to create the feature monitor in. If not set, the location
set in aiplatform.init will be used.
credentials:
Custom credentials to use to create this feature monitor. Overrides
credentials set in aiplatform.init.
request_metadata:
Strings which should be sent along with the request as metadata.
create_request_timeout:
The timeout for the create request in seconds.
Returns:
FeatureMonitor - the FeatureMonitor resource object.
"""
gapic_feature_monitor = gca_feature_monitor.FeatureMonitor()
if description:
gapic_feature_monitor.description = description
if labels:
utils.validate_labels(labels)
gapic_feature_monitor.labels = labels
if request_metadata is None:
request_metadata = ()
if schedule_config:
gapic_feature_monitor.schedule_config = gca_feature_monitor.ScheduleConfig(
cron=schedule_config
)
if feature_selection_configs is None:
raise ValueError(
"Please specify feature_configs: features to be monitored and"
" their thresholds."
)
if feature_selection_configs is not None:
gapic_feature_monitor.feature_selection_config.feature_configs = [
gca_feature_monitor.FeatureSelectionConfig.FeatureConfig(
feature_id=feature_id,
drift_threshold=threshold if threshold else 0.3,
)
for feature_id, threshold in feature_selection_configs
]
api_client = self.__class__._instantiate_client(
location=location, credentials=credentials
)
create_feature_monitor_lro = api_client.select_version(
"v1beta1"
).create_feature_monitor(
parent=self.resource_name,
feature_monitor=gapic_feature_monitor,
feature_monitor_id=name,
metadata=request_metadata,
timeout=create_request_timeout,
)
_LOGGER.log_create_with_lro(FeatureMonitor, create_feature_monitor_lro)
created_feature_monitor = create_feature_monitor_lro.result()
_LOGGER.log_create_complete(
FeatureMonitor, created_feature_monitor, "feature_monitor"
)
feature_monitor_obj = FeatureMonitor(
name=created_feature_monitor.name,
project=project,
location=location,
credentials=credentials,
)
return feature_monitor_obj
def list_feature_monitors(
self,
project: Optional[str] = None,
location: Optional[str] = None,
credentials: Optional[auth_credentials.Credentials] = None,
) -> List[FeatureMonitor]:
"""Lists features monitors under this feature group.
Args:
project:
Project to list feature monitors in. If unset, the project set in
aiplatform.init will be used.
location:
Location to list feature monitors in. If not set, location set in
aiplatform.init will be used.
credentials:
Custom credentials to use to list feature monitors. Overrides
credentials set in aiplatform.init.
Returns:
List of feature monitors under this feature group.
"""
return FeatureMonitor.list(
parent=self.resource_name,
project=project,
location=location,
credentials=credentials,
)
@property
def source(self) -> FeatureGroupBigQuerySource:
return FeatureGroupBigQuerySource(
uri=self._gca_resource.big_query.big_query_source.input_uri,
entity_id_columns=self._gca_resource.big_query.entity_id_columns,
)
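
A hedged end-to-end sketch of the FeatureGroup methods defined above; names, thresholds, and the cron schedule are illustrative:

```python
fg = FeatureGroup("my_feature_group")

# Declare a feature and read it back with its latest monitoring stats.
fg.create_feature("total_spend", description="Total spend per user")
feature = fg.get_feature("total_spend", latest_stats_count=1)

# Monitor drift on that feature daily at 9 AM Eastern, with a 0.2 threshold.
monitor = fg.create_feature_monitor(
    name="spend_monitor",
    schedule_config="TZ=America/New_York 0 9 * * *",
    feature_selection_configs=[("total_spend", 0.2)],
)

print([f.resource_name for f in fg.list_features()])
print([m.resource_name for m in fg.list_feature_monitors()])
```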


@@ -0,0 +1,335 @@
# -*- coding: utf-8 -*-
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import re
from typing import List, Dict, Optional, Tuple, Sequence
from google.auth import credentials as auth_credentials
from google.cloud.aiplatform import base, initializer
from google.cloud.aiplatform import utils
from google.cloud.aiplatform.compat.types import (
feature_monitor_v1beta1 as gca_feature_monitor,
feature_monitor_job_v1beta1 as gca_feature_monitor_job,
)
_LOGGER = base.Logger(__name__)
class FeatureMonitor(base.VertexAiResourceNounWithFutureManager):
"""Class for managing Feature Monitor resources."""
client_class = utils.FeatureRegistryClientV1Beta1WithOverride
_resource_noun = "feature_monitors"
_getter_method = "get_feature_monitor"
_list_method = "list_feature_monitors"
_delete_method = "delete_feature_monitor"
_parse_resource_name_method = "parse_feature_monitor_path"
_format_resource_name_method = "feature_monitor_path"
_gca_resource: gca_feature_monitor.FeatureMonitor
def __init__(
self,
name: str,
feature_group_id: Optional[str] = None,
project: Optional[str] = None,
location: Optional[str] = None,
credentials: Optional[auth_credentials.Credentials] = None,
):
"""Retrieves an existing managed feature.
Args:
name:
The resource name
(`projects/.../locations/.../featureGroups/.../featureMonitors/...`) or
ID.
feature_group_id:
The feature group ID. Must be passed in if name is an ID and not
a resource path.
project:
Project to retrieve the feature monitor from. If not set, the
project set in aiplatform.init will be used.
location:
Location to retrieve the feature monitor from. If not set, the
location set in aiplatform.init will be used.
credentials:
Custom credentials to use to retrieve this feature monitor.
Overrides credentials set in aiplatform.init.
"""
super().__init__(
project=project,
location=location,
credentials=credentials,
resource_name=name,
)
if re.fullmatch(
r"projects/.+/locations/.+/featureGroups/.+/featureMonitors/.+",
name,
):
if feature_group_id:
raise ValueError(
f"Since feature monitor '{name}' is provided as a path, feature_group_id should not be specified."
)
feature_monitor = name
else:
from .feature_group import FeatureGroup
# Construct the feature path using feature group ID if only the
# feature group ID is provided.
if not feature_group_id:
raise ValueError(
f"Since feature monitor '{name}' is not provided as a path, please specify feature_group_id."
)
feature_group_path = utils.full_resource_name(
resource_name=feature_group_id,
resource_noun=FeatureGroup._resource_noun,
parse_resource_name_method=FeatureGroup._parse_resource_name,
format_resource_name_method=FeatureGroup._format_resource_name,
)
feature_monitor = f"{feature_group_path}/featureMonitors/{name}"
self._gca_resource = self._get_gca_resource(resource_name=feature_monitor)
@property
def description(self) -> str:
"""The description of the feature monitor."""
return self._gca_resource.description
@property
def schedule_config(self) -> str:
"""The schedule config of the feature monitor."""
return self._gca_resource.schedule_config.cron
@property
def feature_selection_configs(self) -> List[Tuple[str, float]]:
"""The feature and it's drift threshold configs of the feature monitor."""
configs: List[Tuple[str, float]] = []
for (
feature_config
) in self._gca_resource.feature_selection_config.feature_configs:
configs.append(
(
feature_config.feature_id,
feature_config.drift_threshold
if feature_config.drift_threshold
else 0.3,
)
)
return configs
class FeatureMonitorJob(base.VertexAiResourceNounWithFutureManager):
"""Class for managing Feature Monitor Job resources."""
client_class = utils.FeatureRegistryClientV1Beta1WithOverride
_resource_noun = "featureMonitorJobs"
_getter_method = "get_feature_monitor_job"
_list_method = "list_feature_monitor_jobs"
_delete_method = "delete_feature_monitor_job"
_parse_resource_name_method = "parse_feature_monitor_job_path"
_format_resource_name_method = "feature_monitor_job_path"
_gca_resource: gca_feature_monitor_job.FeatureMonitorJob
def __init__(
self,
name: str,
project: Optional[str] = None,
location: Optional[str] = None,
credentials: Optional[auth_credentials.Credentials] = None,
):
"""Retrieves an existing managed feature monitor job.
Args:
name: The resource name
(`projects/.../locations/.../featureGroups/.../featureMonitors/.../featureMonitorJobs/...`)
project: Project to retrieve the feature monitor job from. If
unset, the project set in aiplatform.init will be used.
location: Location to retrieve the feature monitor job from. If
not set, location set in aiplatform.init will be used.
credentials: Custom credentials to use to retrieve this feature
monitor job. Overrides credentials set in aiplatform.init.
"""
super().__init__(
project=project,
location=location,
credentials=credentials,
resource_name=name,
)
if not re.fullmatch(
r"projects/.+/locations/.+/featureGroups/.+/featureMonitors/.+/featureMonitorJobs/.+",
name,
):
raise ValueError(
"name need to specify the fully qualified"
+ " feature monitor job resource path."
)
self._gca_resource = self._get_gca_resource(resource_name=name)
@property
def description(self) -> str:
"""The description of the feature monitor."""
return self._gca_resource.description
@property
def feature_stats_and_anomalies(
self,
) -> List[gca_feature_monitor.FeatureStatsAndAnomaly]:
"""The feature stats and anomaly of the feature monitor job."""
if self._gca_resource.job_summary:
return self._gca_resource.job_summary.feature_stats_and_anomalies
return []
def create_feature_monitor_job(
self,
description: Optional[str] = None,
labels: Optional[Dict[str, str]] = None,
project: Optional[str] = None,
location: Optional[str] = None,
credentials: Optional[auth_credentials.Credentials] = None,
request_metadata: Optional[Sequence[Tuple[str, str]]] = None,
create_request_timeout: Optional[float] = None,
) -> FeatureMonitorJob:
"""Creates a new feature monitor job.
Args:
description: Description of the feature monitor job.
labels:
The labels with user-defined metadata to organize your
FeatureMonitorJobs.
Label keys and values can be no longer than 64 characters
(Unicode codepoints), can only contain lowercase letters,
numeric characters, underscores and dashes. International
characters are allowed.
See https://goo.gl/xmQnxf for more information on and examples
of labels. No more than 64 user labels can be associated with
one FeatureMonitorJob (System labels are excluded). System reserved label
keys are prefixed with "aiplatform.googleapis.com/" and are
immutable.
project:
Project to create the feature monitor job in. If unset, the project
set in aiplatform.init will be used.
location:
Location to create the feature monitor job in. If not set, the
location set in aiplatform.init will be used.
credentials:
Custom credentials to use to create this feature monitor job.
Overrides credentials set in aiplatform.init.
request_metadata:
Strings which should be sent along with the request as metadata.
create_request_timeout:
The timeout for the create request in seconds.
Returns:
FeatureMonitorJob - the FeatureMonitorJob resource object.
"""
gapic_feature_monitor_job = gca_feature_monitor_job.FeatureMonitorJob()
if description:
gapic_feature_monitor_job.description = description
if labels:
utils.validate_labels(labels)
gapic_feature_monitor_job.labels = labels
if request_metadata is None:
request_metadata = ()
api_client = self.__class__._instantiate_client(
location=location, credentials=credentials
)
created_feature_monitor_job = api_client.select_version(
"v1beta1"
).create_feature_monitor_job(
parent=self.resource_name,
feature_monitor_job=gapic_feature_monitor_job,
metadata=request_metadata,
timeout=create_request_timeout,
)
feature_monitor_job_obj = self.FeatureMonitorJob(
name=created_feature_monitor_job.name,
project=project,
location=location,
credentials=credentials,
)
return feature_monitor_job_obj
def get_feature_monitor_job(
self,
feature_monitor_job_id: str,
credentials: Optional[auth_credentials.Credentials] = None,
) -> FeatureMonitorJob:
"""Retrieves an existing feature monitor.
Args:
feature_monitor_job_id: The ID of the feature monitor job.
credentials:
Custom credentials to use to retrieve the feature monitor job under this
feature monitor. The order of which credentials are used is as
follows - (1) this parameter (2) credentials passed to FeatureMonitor
constructor (3) credentials set in aiplatform.init.
Returns:
FeatureMonitorJob - the Feature Monitor Job resource object under this
feature monitor.
"""
credentials = (
credentials or self.credentials or initializer.global_config.credentials
)
return FeatureMonitor.FeatureMonitorJob(
f"{self.resource_name}/featureMonitorJobs/{feature_monitor_job_id}",
credentials=credentials,
)
def list_feature_monitor_jobs(
self,
project: Optional[str] = None,
location: Optional[str] = None,
credentials: Optional[auth_credentials.Credentials] = None,
) -> List[FeatureMonitorJob]:
"""Lists features monitor jobs under this feature monitor.
Args:
project:
Project to list feature monitors in. If unset, the project set in
aiplatform.init will be used.
location:
Location to list feature monitors in. If not set, location set in
aiplatform.init will be used.
credentials:
Custom credentials to use to list feature monitors. Overrides
credentials set in aiplatform.init.
Returns:
List of feature monitor jobs under this feature monitor.
"""
return FeatureMonitor.FeatureMonitorJob.list(
parent=self.resource_name,
project=project,
location=location,
credentials=credentials,
)
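
A hedged sketch of working with the monitor and job classes defined above; resource IDs are illustrative:

```python
fm = FeatureMonitor("spend_monitor", feature_group_id="my_feature_group")
print(fm.schedule_config, fm.feature_selection_configs)

# Trigger an on-demand monitoring run and inspect the resulting stats/anomalies.
job = fm.create_feature_monitor_job(description="manual run")
print(job.feature_stats_and_anomalies)

# List historical jobs for this monitor.
for j in fm.list_feature_monitor_jobs():
    print(j.resource_name)
```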


@@ -0,0 +1,645 @@
# -*- coding: utf-8 -*-
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import enum
from typing import (
Dict,
List,
Optional,
Sequence,
Tuple,
Union,
)
from google.auth import credentials as auth_credentials
from google.cloud.aiplatform import (
base,
initializer,
utils,
)
from google.cloud.aiplatform.compat.types import (
feature_online_store as gca_feature_online_store,
service_networking as gca_service_networking,
feature_view as gca_feature_view,
)
from vertexai.resources.preview.feature_store.feature_view import (
FeatureView,
)
from vertexai.resources.preview.feature_store.utils import (
IndexConfig,
FeatureViewBigQuerySource,
FeatureViewVertexRagSource,
FeatureViewRegistrySource,
)
_LOGGER = base.Logger(__name__)
@enum.unique
class FeatureOnlineStoreType(enum.Enum):
UNKNOWN = 0
BIGTABLE = 1
OPTIMIZED = 2
class FeatureOnlineStore(base.VertexAiResourceNounWithFutureManager):
"""Class for managing Feature Online Store resources."""
client_class = utils.FeatureOnlineStoreAdminClientWithOverride
_resource_noun = "feature_online_stores"
_getter_method = "get_feature_online_store"
_list_method = "list_feature_online_stores"
_delete_method = "delete_feature_online_store"
_parse_resource_name_method = "parse_feature_online_store_path"
_format_resource_name_method = "feature_online_store_path"
_gca_resource: gca_feature_online_store.FeatureOnlineStore
def __init__(
self,
name: str,
project: Optional[str] = None,
location: Optional[str] = None,
credentials: Optional[auth_credentials.Credentials] = None,
):
"""Retrieves an existing managed feature online store.
Args:
name:
The resource name
(`projects/.../locations/.../featureOnlineStores/...`) or ID.
project:
Project to retrieve feature online store from. If unset, the
project set in aiplatform.init will be used.
location:
Location to retrieve feature online store from. If not set,
location set in aiplatform.init will be used.
credentials:
Custom credentials to use to retrieve this feature online store.
Overrides credentials set in aiplatform.init.
"""
super().__init__(
project=project,
location=location,
credentials=credentials,
resource_name=name,
)
self._gca_resource = self._get_gca_resource(resource_name=name)
@classmethod
@base.optional_sync()
def create_bigtable_store(
cls,
name: str,
min_node_count: Optional[int] = 1,
max_node_count: Optional[int] = 1,
cpu_utilization_target: Optional[int] = 50,
labels: Optional[Dict[str, str]] = None,
project: Optional[str] = None,
location: Optional[str] = None,
credentials: Optional[auth_credentials.Credentials] = None,
request_metadata: Optional[Sequence[Tuple[str, str]]] = None,
create_request_timeout: Optional[float] = None,
sync: bool = True,
) -> "FeatureOnlineStore":
"""Creates a Bigtable online store.
Example Usage:
my_fos = vertexai.preview.FeatureOnlineStore.create_bigtable_store('my_fos')
Args:
name: The name of the feature online store.
min_node_count:
The minimum number of Bigtable nodes to scale down to. Must be
greater than or equal to 1.
max_node_count:
The maximum number of Bigtable nodes to scale up to. Must
satisfy min_node_count <= max_node_count <= (10 *
min_node_count).
cpu_utilization_target:
A percentage of the cluster's CPU capacity. Can be from 10% to
80%. When a cluster's CPU utilization exceeds the target that
you have set, Bigtable immediately adds nodes to the cluster.
When CPU utilization is substantially lower than the target,
Bigtable removes nodes. If not set, defaults to 50%.
labels:
The labels with user-defined metadata to organize your feature
online store. Label keys and values can be no longer than 64
characters (Unicode codepoints), can only contain lowercase
letters, numeric characters, underscores and dashes.
International characters are allowed. See https://goo.gl/xmQnxf
for more information on and examples of labels. No more than 64
user labels can be associated with one feature online store
(System labels are excluded). System reserved label keys are
prefixed with "aiplatform.googleapis.com/" and are immutable.
project:
Project to create feature online store in. If unset, the project
set in aiplatform.init will be used.
location:
Location to create feature online store in. If not set, location
set in aiplatform.init will be used.
credentials:
Custom credentials to use to create this feature online store.
Overrides credentials set in aiplatform.init.
request_metadata:
Strings which should be sent along with the request as metadata.
create_request_timeout:
The timeout for the create request in seconds.
sync:
Whether to execute this creation synchronously. If False, this
method will be executed in concurrent Future and any downstream
object will be immediately returned and synced when the Future
has completed.
Returns:
FeatureOnlineStore - the FeatureOnlineStore resource object.
"""
if min_node_count < 1:
raise ValueError("min_node_count must be greater than or equal to 1")
if max_node_count < min_node_count:
raise ValueError(
"max_node_count must be greater than or equal to min_node_count"
)
elif 10 * min_node_count < max_node_count:
raise ValueError(
"max_node_count must be less than or equal to 10 * min_node_count"
)
if cpu_utilization_target < 10 or cpu_utilization_target > 80:
raise ValueError("cpu_utilization_target must be between 10 and 80")
gapic_feature_online_store = gca_feature_online_store.FeatureOnlineStore(
bigtable=gca_feature_online_store.FeatureOnlineStore.Bigtable(
auto_scaling=gca_feature_online_store.FeatureOnlineStore.Bigtable.AutoScaling(
min_node_count=min_node_count,
max_node_count=max_node_count,
cpu_utilization_target=cpu_utilization_target,
),
),
)
if labels:
utils.validate_labels(labels)
gapic_feature_online_store.labels = labels
if request_metadata is None:
request_metadata = ()
api_client = cls._instantiate_client(location=location, credentials=credentials)
create_online_store_lro = api_client.create_feature_online_store(
parent=initializer.global_config.common_location_path(
project=project, location=location
),
feature_online_store=gapic_feature_online_store,
feature_online_store_id=name,
metadata=request_metadata,
timeout=create_request_timeout,
)
_LOGGER.log_create_with_lro(cls, create_online_store_lro)
created_online_store = create_online_store_lro.result()
_LOGGER.log_create_complete(cls, created_online_store, "feature_online_store")
online_store_obj = cls(
name=created_online_store.name,
project=project,
location=location,
credentials=credentials,
)
return online_store_obj
@classmethod
@base.optional_sync()
def create_optimized_store(
cls,
name: str,
enable_private_service_connect: bool = False,
project_allowlist: Optional[Sequence[str]] = None,
labels: Optional[Dict[str, str]] = None,
project: Optional[str] = None,
location: Optional[str] = None,
credentials: Optional[auth_credentials.Credentials] = None,
request_metadata: Optional[Sequence[Tuple[str, str]]] = None,
create_request_timeout: Optional[float] = None,
sync: bool = True,
) -> "FeatureOnlineStore":
"""Creates an Optimized online store.
Example Usage:
```
# Create optimized store with public endpoint.
my_fos = vertexai.preview.FeatureOnlineStore.create_optimized_store('my_fos')
```
```
# Create optimized online store with private service connect.
my_fos = vertexai.preview.FeatureOnlineStore.create_optimized_store(
'my_fos',
enable_private_service_connect=True,
project_allowlist=['my-project'],
)
```
Args:
name: The name of the feature online store.
enable_private_service_connect:
Optional. If true, expose the optimized online store
via private service connect. Otherwise the optimized online
store will be accessible through public endpoint.
project_allowlist:
A list of Projects from which the forwarding
rule will target the service attachment. Only needed when
`enable_private_service_connect` is set to true.
labels:
The labels with user-defined metadata to organize your feature
online store. Label keys and values can be no longer than 64
characters (Unicode codepoints), can only contain lowercase
letters, numeric characters, underscores and dashes.
International characters are allowed. See https://goo.gl/xmQnxf
for more information on and examples of labels. No more than 64
user labels can be associated with one feature online store
(System labels are excluded). System reserved label keys are
prefixed with "aiplatform.googleapis.com/" and are immutable.
project:
Project to create feature online store in. If unset, the project
set in aiplatform.init will be used.
location:
Location to create feature online store in. If not set, location
set in aiplatform.init will be used.
credentials:
Custom credentials to use to create this feature online store.
Overrides credentials set in aiplatform.init.
request_metadata:
Strings which should be sent along with the request as metadata.
create_request_timeout:
The timeout for the create request in seconds.
sync:
Whether to execute this creation synchronously. If False, this
method will be executed in concurrent Future and any downstream
object will be immediately returned and synced when the Future
has completed.
Returns:
FeatureOnlineStore - the FeatureOnlineStore resource object.
"""
if enable_private_service_connect:
if not project_allowlist:
raise ValueError(
"`project_allowlist` cannot be empty when `enable_private_service_connect` is set to true."
)
dedicated_serving_endpoint = gca_feature_online_store.FeatureOnlineStore.DedicatedServingEndpoint(
private_service_connect_config=gca_service_networking.PrivateServiceConnectConfig(
enable_private_service_connect=True,
project_allowlist=project_allowlist,
),
)
else:
dedicated_serving_endpoint = (
gca_feature_online_store.FeatureOnlineStore.DedicatedServingEndpoint()
)
gapic_feature_online_store = gca_feature_online_store.FeatureOnlineStore(
optimized=gca_feature_online_store.FeatureOnlineStore.Optimized(),
dedicated_serving_endpoint=dedicated_serving_endpoint,
)
if labels:
utils.validate_labels(labels)
gapic_feature_online_store.labels = labels
if request_metadata is None:
request_metadata = ()
api_client = cls._instantiate_client(location=location, credentials=credentials)
create_online_store_lro = api_client.create_feature_online_store(
parent=initializer.global_config.common_location_path(
project=project, location=location
),
feature_online_store=gapic_feature_online_store,
feature_online_store_id=name,
metadata=request_metadata,
timeout=create_request_timeout,
)
_LOGGER.log_create_with_lro(cls, create_online_store_lro)
created_online_store = create_online_store_lro.result()
_LOGGER.log_create_complete(cls, created_online_store, "feature_online_store")
online_store_obj = cls(
name=created_online_store.name,
project=project,
location=location,
credentials=credentials,
)
return online_store_obj
@base.optional_sync()
def delete(self, force: bool = False, sync: bool = True) -> None:
"""Deletes this online store.
WARNING: This deletion is permanent.
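Example Usage (illustrative; 'my_fos' is a placeholder online store ID):
```
fos = vertexai.preview.FeatureOnlineStore('my_fos')
# Delete the online store and any feature views it still contains.
fos.delete(force=True)
```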
Args:
force:
If set to True, all feature views under this online store will
be deleted prior to online store deletion. Otherwise, deletion
will only succeed if the online store has no FeatureViews.
sync:
Whether to execute this deletion synchronously. If False, this
method will be executed in concurrent Future and any downstream
object will be immediately returned and synced when the Future
has completed.
"""
lro = getattr(self.api_client, self._delete_method)(
name=self.resource_name,
force=force,
)
_LOGGER.log_delete_with_lro(self, lro)
lro.result()
_LOGGER.log_delete_complete(self)
@property
def feature_online_store_type(self) -> FeatureOnlineStoreType:
if self._gca_resource.bigtable:
return FeatureOnlineStoreType.BIGTABLE
# Optimized is an empty proto, so self._gca_resource.optimized is always false.
elif hasattr(self.gca_resource, "optimized"):
return FeatureOnlineStoreType.OPTIMIZED
else:
raise ValueError(
f"Online store does not have type or is unsupported by SDK: {self._gca_resource}."
)
@property
def labels(self) -> Dict[str, str]:
return self._gca_resource.labels
@base.optional_sync()
def create_feature_view(
self,
name: str,
source: Union[
FeatureViewBigQuerySource,
FeatureViewVertexRagSource,
FeatureViewRegistrySource,
],
labels: Optional[Dict[str, str]] = None,
sync_config: Optional[str] = None,
index_config: Optional[IndexConfig] = None,
project: Optional[str] = None,
location: Optional[str] = None,
credentials: Optional[auth_credentials.Credentials] = None,
request_metadata: Optional[Sequence[Tuple[str, str]]] = None,
create_request_timeout: Optional[float] = None,
sync: bool = True,
) -> FeatureView:
"""Creates a FeatureView from a BigQuery source.
Example Usage:
```
existing_fos = FeatureOnlineStore('my_fos')
new_fv = existing_fos.create_feature_view(
    'my_fv',
    FeatureViewBigQuerySource(
        uri='bq://my-proj/dataset/table',
        entity_id_columns=['entity_id'],
    ),
)
# Example for how to create an embedding FeatureView.
embedding_fv = existing_fos.create_feature_view(
    'my_embedding_fv',
    FeatureViewBigQuerySource(
        uri='bq://my-proj/dataset/table',
        entity_id_columns=['entity_id'],
    ),
    index_config=IndexConfig(
        embedding_column="embedding",
        filter_columns=["currency_code", "gender"],
        crowding_column="crowding",
        dimensions=1536,
        distance_measure_type=DistanceMeasureType.SQUARED_L2_DISTANCE,
        algorithm_config=TreeAhConfig(),
    ),
)
```
Args:
name: The name of the feature view.
source:
The source to load data from when a feature view sync runs.
Currently supports a BigQuery source, a Vertex RAG source, or a Feature Registry source.
labels:
The labels with user-defined metadata to organize your
FeatureViews.
Label keys and values can be no longer than 64 characters
(Unicode codepoints), can only contain lowercase letters,
numeric characters, underscores and dashes. International
characters are allowed.
See https://goo.gl/xmQnxf for more information on and examples
of labels. No more than 64 user labels can be associated with
one FeatureOnlineStore (System labels are excluded). System
reserved label keys are prefixed with
"aiplatform.googleapis.com/" and are immutable.
sync_config:
Configures when data is to be synced/updated for this
FeatureView. At the end of the sync the latest feature values
for each entity ID of this FeatureView are made ready for online
serving. Example format: "TZ=America/New_York 0 9 * * *" (sync
daily at 9 AM EST).
index_config:
Configuration for index preparation for vector search. It
contains the required configurations to create an index from
source data, so that approximate nearest neighbor (a.k.a ANN)
algorithms search can be performed during online serving.
project:
Project to create feature view in. If unset, the project set in
aiplatform.init will be used.
location:
Location to create feature view in. If not set, location set in
aiplatform.init will be used.
credentials:
Custom credentials to use to create this feature view.
Overrides credentials set in aiplatform.init.
request_metadata:
Strings which should be sent along with the request as metadata.
create_request_timeout:
The timeout for the create request in seconds.
sync:
Whether to execute this creation synchronously. If False, this
method will be executed in concurrent Future and any downstream
object will be immediately returned and synced when the Future
has completed.
Returns:
FeatureView - the FeatureView resource object.
"""
if not source:
raise ValueError("Please specify a valid source.")
big_query_source = None
vertex_rag_source = None
feature_registry_source = None
if isinstance(source, FeatureViewBigQuerySource):
if not source.uri:
raise ValueError("Please specify URI in BigQuery source.")
if not source.entity_id_columns:
raise ValueError("Please specify entity ID columns in BigQuery source.")
big_query_source = gca_feature_view.FeatureView.BigQuerySource(
uri=source.uri,
entity_id_columns=source.entity_id_columns,
)
elif isinstance(source, FeatureViewVertexRagSource):
if not source.uri:
raise ValueError("Please specify URI in Vertex RAG source.")
vertex_rag_source = gca_feature_view.FeatureView.VertexRagSource(
uri=source.uri,
rag_corpus_id=source.rag_corpus_id or None,
)
elif isinstance(source, FeatureViewRegistrySource):
if not source.features:
raise ValueError(
"Please specify features in Registry Source in format `<feature_group_id>.<feature_id>`."
)
feature_group_mappings = {}
for feature in source.features:
feature_group_id, feature_id = feature.split(".")
if not feature_id or not feature_group_id:
raise ValueError(
"Please specify features in Registry Source in format `<feature_group_id>.<feature_id>`."
)
if feature_group_id in feature_group_mappings:
feature_group_mappings[feature_group_id].append(feature_id)
else:
feature_group_mappings[feature_group_id] = [feature_id]
feature_groups = []
for feature_group_id in feature_group_mappings:
feature_ids = feature_group_mappings[feature_group_id]
feature_groups.append(
gca_feature_view.FeatureView.FeatureRegistrySource.FeatureGroup(
feature_group_id=feature_group_id,
feature_ids=feature_ids,
)
)
feature_registry_source = (
gca_feature_view.FeatureView.FeatureRegistrySource(
feature_groups=feature_groups,
project_number=source.project_number or None,
)
)
else:
raise ValueError(
"Only FeatureViewBigQuerySource, FeatureViewVertexRagSource and FeatureViewRegistrySource are supported sources."
)
gapic_feature_view = gca_feature_view.FeatureView(
big_query_source=big_query_source,
vertex_rag_source=vertex_rag_source,
feature_registry_source=feature_registry_source,
sync_config=gca_feature_view.FeatureView.SyncConfig(cron=sync_config)
if sync_config
else None,
)
if labels:
utils.validate_labels(labels)
gapic_feature_view.labels = labels
if request_metadata is None:
request_metadata = ()
if index_config:
gapic_feature_view.index_config = gca_feature_view.FeatureView.IndexConfig(
index_config.as_dict()
)
api_client = self.__class__._instantiate_client(
location=location, credentials=credentials
)
create_feature_view_lro = api_client.create_feature_view(
parent=self.resource_name,
feature_view=gapic_feature_view,
feature_view_id=name,
metadata=request_metadata,
timeout=create_request_timeout,
)
_LOGGER.log_create_with_lro(FeatureView, create_feature_view_lro)
created_feature_view = create_feature_view_lro.result()
_LOGGER.log_create_complete(FeatureView, created_feature_view, "feature_view")
feature_view_obj = FeatureView(
name=created_feature_view.name,
project=project,
location=location,
credentials=credentials,
)
return feature_view_obj
def list_feature_views(
self,
project: Optional[str] = None,
location: Optional[str] = None,
credentials: Optional[auth_credentials.Credentials] = None,
) -> List[FeatureView]:
"""Lists feature views under this feature online store.
Args:
project:
Project to list feature views in. If unset, the project set in
aiplatform.init will be used.
location:
Location to list feature views in. If not set, location set in
aiplatform.init will be used.
credentials:
Custom credentials to use to list feature views. Overrides
credentials set in aiplatform.init.
Returns:
List of feature views under this feature online store.
"""
return FeatureView.list(
feature_online_store_id=self.name,
project=project,
location=location,
credentials=credentials,
)


@@ -0,0 +1,539 @@
# -*- coding: utf-8 -*-
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import re
from typing import List, Dict, Optional
from google.cloud.aiplatform import initializer
from google.auth import credentials as auth_credentials
from google.cloud.aiplatform import base
from google.cloud.aiplatform import utils
from google.cloud.aiplatform.compat.types import (
feature_view_sync as gca_feature_view_sync,
feature_view as gca_feature_view,
feature_online_store_service as fos_service,
)
import vertexai.resources.preview.feature_store.utils as fs_utils
_LOGGER = base.Logger(__name__)
class FeatureView(base.VertexAiResourceNounWithFutureManager):
"""Class for managing Feature View resources."""
client_class = utils.FeatureOnlineStoreAdminClientWithOverride
_resource_noun = "featureViews"
_getter_method = "get_feature_view"
_list_method = "list_feature_views"
_delete_method = "delete_feature_view"
_parse_resource_name_method = "parse_feature_view_path"
_format_resource_name_method = "feature_view_path"
_gca_resource: gca_feature_view.FeatureView
_online_store_client: utils.FeatureOnlineStoreClientWithOverride
_online_store_clients_with_connection_options: Dict[
fs_utils.ConnectionOptions, utils.FeatureOnlineStoreClientWithOverride
] = None
def __init__(
self,
name: str,
feature_online_store_id: Optional[str] = None,
project: Optional[str] = None,
location: Optional[str] = None,
credentials: Optional[auth_credentials.Credentials] = None,
):
"""Retrieves an existing managed feature view.
Args:
name:
The resource name
(`projects/.../locations/.../featureOnlineStores/.../featureViews/...`)
or ID.
feature_online_store_id:
The feature online store ID. Must be passed in if name is an ID
and not a resource path.
project:
Project to retrieve the feature view from. If unset, the project
set in aiplatform.init will be used.
location:
Location to retrieve the feature view from. If not set, location
set in aiplatform.init will be used.
credentials:
Custom credentials to use to retrieve this feature view.
Overrides credentials set in aiplatform.init.
"""
super().__init__(
project=project,
location=location,
credentials=credentials,
resource_name=name,
)
if re.fullmatch(
r"projects/.+/locations/.+/featureOnlineStores/.+/featureViews/.+",
name,
):
feature_view = name
else:
from .feature_online_store import FeatureOnlineStore
# Construct the feature view path using feature online store ID if
# only the feature view ID is provided.
if not feature_online_store_id:
raise ValueError(
"Since feature view is not provided as a path, please specify"
+ " feature_online_store_id."
)
feature_online_store_path = utils.full_resource_name(
resource_name=feature_online_store_id,
resource_noun=FeatureOnlineStore._resource_noun,
parse_resource_name_method=FeatureOnlineStore._parse_resource_name,
format_resource_name_method=FeatureOnlineStore._format_resource_name,
)
feature_view = f"{feature_online_store_path}/featureViews/{name}"
self._gca_resource = self._get_gca_resource(resource_name=feature_view)
def _get_online_store_client(
self, connection_options: Optional[fs_utils.ConnectionOptions] = None
) -> utils.FeatureOnlineStoreClientWithOverride:
"""Return the online store client.
Also sets the `_online_store_client` attr if not set yet. Note that if
`connection_options` is passed in, the `_online_store_client` attr will
not be set - only the client will be returned. If the same
`connection_options` is passed in, this code will return the same
(cached) client as previously built.
"""
if getattr(self, "_online_store_client", None):
return self._online_store_client
fos_name = fs_utils.get_feature_online_store_name(self.resource_name)
from .feature_online_store import FeatureOnlineStore
fos = FeatureOnlineStore(name=fos_name)
if connection_options:
# Check if we have a previously client created for these
# connection_options.
if self._online_store_clients_with_connection_options is None:
self._online_store_clients_with_connection_options = {}
if connection_options in self._online_store_clients_with_connection_options:
return self._online_store_clients_with_connection_options[
connection_options
]
host = connection_options.host
if isinstance(
connection_options.transport,
fs_utils.ConnectionOptions.InsecureGrpcChannel,
):
import grpc
from google.cloud.aiplatform_v1.services import (
feature_online_store_service as feature_online_store_service_v1,
)
from google.cloud.aiplatform_v1beta1.services import (
feature_online_store_service as feature_online_store_service_v1beta1,
)
gapic_client_class = (
utils.FeatureOnlineStoreClientWithOverride.get_gapic_client_class()
)
gapic_client_class_to_transport_class = {
feature_online_store_service_v1.client.FeatureOnlineStoreServiceClient: (
feature_online_store_service_v1.transports.grpc.FeatureOnlineStoreServiceGrpcTransport
),
feature_online_store_service_v1beta1.client.FeatureOnlineStoreServiceClient: (
feature_online_store_service_v1beta1.transports.grpc.FeatureOnlineStoreServiceGrpcTransport
),
}
if gapic_client_class not in gapic_client_class_to_transport_class:
raise ValueError(
f"Unexpected gapic class '{gapic_client_class}' used by internal client."
)
transport_class = gapic_client_class_to_transport_class[
gapic_client_class
]
client = gapic_client_class(
transport=transport_class(
channel=grpc.insecure_channel(host + ":10002")
),
)
self._online_store_clients_with_connection_options[
connection_options
] = client
return client
else:
raise ValueError(
f"Unsupported connection transport type, got transport: {connection_options.transport}"
)
if fos._gca_resource.bigtable.auto_scaling:
# This is Bigtable online store.
_LOGGER.info(f"Connecting to Bigtable online store name {fos_name}")
self._online_store_client = initializer.global_config.create_client(
client_class=utils.FeatureOnlineStoreClientWithOverride,
credentials=self.credentials,
location_override=self.location,
)
return self._online_store_client
if (
fos._gca_resource.dedicated_serving_endpoint.private_service_connect_config.enable_private_service_connect
):
raise ValueError(
"Use `connection_options` to specify an IP address. Required for optimized online store with private service connect."
)
# From here, optimized serving with public endpoint.
if not fos._gca_resource.dedicated_serving_endpoint.public_endpoint_domain_name:
raise fs_utils.PublicEndpointNotFoundError(
"Public endpoint is not created yet for the optimized online store:"
f"{fos_name}. Please run sync and wait for it to complete."
)
_LOGGER.info(
f"Public endpoint for the optimized online store {fos_name} is"
f" {fos._gca_resource.dedicated_serving_endpoint.public_endpoint_domain_name}"
)
self._online_store_client = initializer.global_config.create_client(
client_class=utils.FeatureOnlineStoreClientWithOverride,
credentials=self.credentials,
location_override=self.location,
prediction_client=True,
api_path_override=fos._gca_resource.dedicated_serving_endpoint.public_endpoint_domain_name,
)
return self._online_store_client
@classmethod
def list(
cls,
feature_online_store_id: str,
filter: Optional[str] = None,
project: Optional[str] = None,
location: Optional[str] = None,
credentials: Optional[auth_credentials.Credentials] = None,
) -> List["FeatureView"]:
"""List all feature view under feature_online_store_id.
Example Usage:
```
feature_views = vertexai.preview.FeatureView.list(
feature_online_store_id="my_fos",
filter='labels.label_key=label_value')
```
Args:
feature_online_store_id:
Parent feature online store ID.
filter:
Filter to apply on the returned feature online store.
project:
Project to use to get a list of feature views. If unset, the
project set in aiplatform.init will be used.
location:
Location to use to get a list of feature views. If not set,
location set in aiplatform.init will be used.
credentials:
Custom credentials to use to get a list of feature views.
Overrides credentials set in aiplatform.init.
Returns:
List[FeatureView] - list of FeatureView resource object.
"""
from .feature_online_store import FeatureOnlineStore
fos = FeatureOnlineStore(
name=feature_online_store_id,
project=project,
location=location,
credentials=credentials,
)
return cls._list(
filter=filter, credentials=credentials, parent=fos.resource_name
)
@base.optional_sync()
def delete(self, sync: bool = True) -> None:
"""Deletes this feature view.
WARNING: This deletion is permanent.
Args:
sync:
Whether to execute this deletion synchronously. If False, this
method will be executed in concurrent Future and any downstream
object will be immediately returned and synced when the Future
has completed.
"""
lro = getattr(self.api_client, self._delete_method)(name=self.resource_name)
_LOGGER.log_delete_with_lro(self, lro)
lro.result()
_LOGGER.log_delete_complete(self)
def sync(self) -> "FeatureViewSync":
"""Starts an on-demand Sync for the FeatureView.
Args: None
Returns:
"FeatureViewSync" - FeatureViewSync instance
"""
sync_method = getattr(self.api_client, self.FeatureViewSync.sync_method())
sync_request = {
"feature_view": self.resource_name,
}
sync_response = sync_method(request=sync_request)
return self.FeatureViewSync(name=sync_response.feature_view_sync)
def get_sync(self, name) -> "FeatureViewSync":
"""Gets the FeatureViewSync resource for the given name.
Args:
name: The resource ID
Returns:
"FeatureViewSync" - FeatureViewSync instance
"""
feature_view_path = self.resource_name
feature_view_sync = f"{feature_view_path}/featureViewSyncs/{name}"
return self.FeatureViewSync(name=feature_view_sync)
def list_syncs(
self,
filter: Optional[str] = None,
) -> List["FeatureViewSync"]:
"""List all feature view under this FeatureView.
Args:
filter: Filter to apply on the returned FeatureViewSyncs.
Returns:
List[FeatureViewSync] - list of FeatureViewSync resource object.
"""
return self.FeatureViewSync._list(
filter=filter, credentials=self.credentials, parent=self.resource_name
)
def read(
self,
key: List[str],
connection_options: Optional[fs_utils.ConnectionOptions] = None,
request_timeout: Optional[float] = None,
) -> fs_utils.FeatureViewReadResponse:
"""Read the feature values from FeatureView.
Example Usage:
Read feature view. Use this for Bigtable online stores and for
Optimized online stores that use public endpoint.
```
data = vertexai.preview.FeatureView(
name='feature_view_name', feature_online_store_id='fos_name')
.read(key=[12345, 6789])
.to_dict()
```
Read feature view using IP with an insecure gRPC channel. Use this
for optimized online stores using private service connect.
```
data = vertexai.preview.FeatureView(
name='feature_view_name', feature_online_store_id='fos_name')
.read(
key=[12345, 6789],
connection_options=fs_utils.ConnectionOptions(
host="<ip>",
transport=fs_utils.ConnectionOptions.InsecureGrpcChannel()))
.to_dict()
```
Args:
key: The request key to read feature values for.
connection_options:
If specified, use these options to connect to a host for sending
requests instead of the default
`<region>-aiplatform.googleapis.com` or the feature online
store's public endpoint.
Returns:
"FeatureViewReadResponse" - FeatureViewReadResponse object. It is
intermediate class that can be further converted by to_dict() or
to_proto().
"""
self.wait()
online_store_client = self._get_online_store_client(
connection_options=connection_options
)
response = online_store_client.fetch_feature_values(
feature_view=self.resource_name,
data_key=fos_service.FeatureViewDataKey(
composite_key=fos_service.FeatureViewDataKey.CompositeKey(parts=key)
),
timeout=request_timeout,
)
return fs_utils.FeatureViewReadResponse(response)
def search(
self,
entity_id: Optional[str] = None,
embedding_value: Optional[List[float]] = None,
neighbor_count: Optional[int] = None,
string_filters: Optional[
List[fos_service.NearestNeighborQuery.StringFilter]
] = None,
per_crowding_attribute_neighbor_count: Optional[int] = None,
return_full_entity: bool = False,
approximate_neighbor_candidates: Optional[int] = None,
leaf_nodes_search_fraction: Optional[float] = None,
request_timeout: Optional[float] = None,
) -> fs_utils.SearchNearestEntitiesResponse:
"""Search the nearest entities from FeatureView.
Example Usage:
```
data = vertexai.preview.FeatureView(
name='feature_view_name', feature_online_store_id='fos_name')
.search(entity_id='sample_entity')
.to_dict()
```
Args:
entity_id: The entity id whose similar entities should be searched
for.
embedding_value: The embedding vector to be used for similarity
search.
neighbor_count: The number of similar entities to be retrieved
from feature view for each query.
string_filters: The list of string filters.
per_crowding_attribute_neighbor_count: Crowding is a constraint on a
neighbor list produced by nearest neighbor search requiring that
no more than per_crowding_attribute_neighbor_count of the k
neighbors returned have the same value of crowding_attribute.
It's used for improving result diversity.
return_full_entity: If true, return full entities including the
features other than embeddings.
approximate_neighbor_candidates: The number of neighbors to find via
approximate search before exact reordering is performed; if set,
this value must be > neighbor_count.
leaf_nodes_search_fraction: The fraction of the number of leaves to
search, set at query time to tune the trade-off between search
accuracy and latency. Increasing this value increases both search
accuracy and latency. The value should be between 0.0 and 1.0.
Returns:
"SearchNearestEntitiesResponse" - SearchNearestEntitiesResponse
object. It is intermediate class that can be further converted by
to_dict() or to_proto()
"""
self.wait()
if entity_id:
embedding = None
elif embedding_value:
embedding = fos_service.NearestNeighborQuery.Embedding(
value=embedding_value
)
else:
raise ValueError(
"Either entity_id or embedding_value needs to be provided for search."
)
response = self._get_online_store_client().search_nearest_entities(
request=fos_service.SearchNearestEntitiesRequest(
feature_view=self.resource_name,
query=fos_service.NearestNeighborQuery(
entity_id=entity_id,
embedding=embedding,
neighbor_count=neighbor_count,
string_filters=string_filters,
per_crowding_attribute_neighbor_count=per_crowding_attribute_neighbor_count, # pylint: disable=line-too-long
parameters=fos_service.NearestNeighborQuery.Parameters(
approximate_neighbor_candidates=approximate_neighbor_candidates,
leaf_nodes_search_fraction=leaf_nodes_search_fraction,
),
),
return_full_entity=return_full_entity,
),
timeout=request_timeout,
)
return fs_utils.SearchNearestEntitiesResponse(response)
class FeatureViewSync(base.VertexAiResourceNounWithFutureManager):
"""Class for managing Feature View Sync resources."""
client_class = utils.FeatureOnlineStoreAdminClientWithOverride
_resource_noun = "featureViewSyncs"
_getter_method = "get_feature_view_sync"
_list_method = "list_feature_view_syncs"
_delete_method = "delete_feature_view"
_sync_method = "sync_feature_view"
_parse_resource_name_method = "parse_feature_view_sync_path"
_format_resource_name_method = "feature_view_sync_path"
_gca_resource: gca_feature_view_sync.FeatureViewSync
def __init__(
self,
name: str,
project: Optional[str] = None,
location: Optional[str] = None,
credentials: Optional[auth_credentials.Credentials] = None,
):
"""Retrieves an existing managed feature view sync.
Args:
name: The resource name
(`projects/.../locations/.../featureOnlineStores/.../featureViews/.../featureViewSyncs/...`)
project: Project to retrieve the feature view from. If unset, the
project set in aiplatform.init will be used.
location: Location to retrieve the feature view from. If not set,
location set in aiplatform.init will be used.
credentials: Custom credentials to use to retrieve this feature view.
Overrides credentials set in aiplatform.init.
"""
super().__init__(
project=project,
location=location,
credentials=credentials,
resource_name=name,
)
if not re.fullmatch(
r"projects/.+/locations/.+/featureOnlineStores/.+/featureViews/.+/featureViewSyncs/.+",
name,
):
raise ValueError(
"name need to specify the fully qualified"
+ " feature_view_sync resource path."
)
self._gca_resource = getattr(self.api_client, self._getter_method)(
name=name, retry=base._DEFAULT_RETRY
)
@classmethod
def sync_method(cls) -> str:
"""Returns the sync method."""
return cls._sync_method


@@ -0,0 +1,291 @@
# -*- coding: utf-8 -*-
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import datetime
import re
from typing import Optional, List, Tuple, Union, TYPE_CHECKING
from google.auth import credentials as auth_credentials
from vertexai.resources.preview.feature_store import (
FeatureGroup,
Feature,
)
from google.cloud.aiplatform import initializer, __version__
from . import _offline_store_impl as impl
if TYPE_CHECKING:
try:
import bigframes
except ImportError:
bigframes = None
try:
import pandas as pd
except ImportError:
pd = None
def _try_import_bigframes():
"""Try to import `bigframes` and return it if successful - otherwise raise an import error."""
try:
import bigframes
import bigframes.pandas
return bigframes
except ImportError as exc:
raise ImportError(
"`bigframes` is not installed but required for this functionality."
) from exc
def _get_feature_group_from_feature(
feature: Feature, credentials: auth_credentials.Credentials
):
"""Given a feature, return the feature group resource."""
result = re.fullmatch(
r"projects/(?P<project>.+)/locations/(?P<location>.+)/featureGroups/(?P<feature_group>.+)/features/.+",
feature.resource_name,
)
if not result:
raise ValueError("Couldn't find feature group in feature.")
project = feature.project
location = feature.location
feature_group = result.group("feature_group")
return FeatureGroup(
feature_group, project=project, location=location, credentials=credentials
)
def _extract_feature_from_str_repr(
str_feature: str, credentials: auth_credentials.Credentials
) -> Tuple[FeatureGroup, Feature]:
"""Given a feature in string representation, return the feature and feature group."""
# TODO: compile expr + place it in a constant
result = re.fullmatch(
r"((?P<project>.*)\.)?(?P<feature_group>.*)\.(?P<feature>.*)",
str_feature,
)
if not result:
raise ValueError(
f"Feature '{str_feature}' is a string but not in expected format 'feature_group.feature' or 'project.feature_group.feature'."
)
feature_group = FeatureGroup(
result.group("feature_group"),
project=result.group("project"), # None if no match.
credentials=credentials,
)
feature = feature_group.get_feature(result.group("feature"))
return (feature_group, feature)
def _feature_to_data_source(
feature_group: FeatureGroup, feature: Feature
) -> impl.DataSource:
qualifying_name = f"{feature_group.name}__{feature.name}"
gbq_column = feature.version_column_name
assert gbq_column
column_name = feature.name
assert column_name
timestamp_column = "feature_timestamp"
# TODO: Expose entity_id_columns as a property in FeatureGroup
entity_id_columns = feature_group._gca_resource.big_query.entity_id_columns
assert entity_id_columns
bq_uri = feature_group._gca_resource.big_query.big_query_source.input_uri
assert bq_uri
# Remove the "bq://" scheme prefix (lstrip would strip a character set, not the prefix).
fully_qualified_table = bq_uri[len("bq://") :] if bq_uri.startswith("bq://") else bq_uri
assert fully_qualified_table
query = (
f"SELECT\n"
f' {", ".join(entity_id_columns)},\n'
f" {gbq_column} AS {column_name},\n"
f" {timestamp_column}\n"
f"FROM {fully_qualified_table}"
)
return impl.DataSource(
qualifying_name=qualifying_name,
sql=query,
data_columns=[column_name],
# TODO: this will be parameterized in the future
timestamp_column=timestamp_column,
entity_id_columns=entity_id_columns,
)
class _DataFrameToBigQueryDataFramesConverter:
@classmethod
def to_bigquery_dataframe(
cls, df: "pd.DataFrame", session: "Optional[bigframes.session.Session]" = None
) -> "bigframes.pandas.DataFrame":
bigframes = _try_import_bigframes()
return bigframes.pandas.DataFrame(data=df, session=session)
def fetch_historical_feature_values(
entity_df: "bigframes.pandas.DataFrame",
# TODO: Add support for FeatureView | FeatureGroup | bigframes.pandas.DataFrame
features: List[Union[str, Feature]],
# TODO: Add support for feature_age_threshold
feature_age_threshold: Optional[datetime.timedelta] = None,
dry_run: bool = False,
project: Optional[str] = None,
location: Optional[str] = None,
credentials: Optional[auth_credentials.Credentials] = None,
) -> "Union[bigframes.pandas.DataFrame, None]":
"""Fetch historical data at the timestamp specified for each entity.
This runs a Point-In-Time Lookup (PITL) query in BigQuery across all
features and returns the historical feature values. Feature data will be
joined by matching their entity_id_column(s) with corresponding columns in
the entity data frame.
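Example Usage (illustrative sketch; 'user_id', 'my_fg' and 'my_feature' are
placeholder column / feature group / feature names):
```
import datetime
import pandas as pd
entity_df = pd.DataFrame(
    data={
        "user_id": ["alice", "bob"],
        "timestamp": [
            datetime.datetime(2024, 1, 1),
            datetime.datetime(2024, 1, 2),
        ],
    },
)
# Features may be passed as strings in 'feature_group.feature' format.
historical_df = fetch_historical_feature_values(
    entity_df=entity_df,
    features=["my_fg.my_feature"],
)
```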
Args:
entity_df:
An entity DataFrame where one/multiple columns have entity ID.
One column should have a timestamp (used for feature lookup). Other
columns may have feature data. Entity IDs may be repeated with
different timestamp values (in the timestamp column) to lookup data for
entities at different points in time.
features:
Feature data will be joined with the entity data frame.
* If `str` is given, use `feature_group.feature` as the format.
`project.feature_group.feature` may be used if the features are
in another project.
* If `FeatureView` is given, the *sources* of the FeatureView will be
used - but data will be read from the backing BigQuery table.
feature_age_threshold:
How far back from the timestamp to look for features values. If no
feature values are found, empty/null value will be populated.
dry_run:
Build the Point-In-Time Lookup (PITL) query but don't run it. The PITL
query will be printed to stdout.
project:
The project to use for feature lookup and running the Point-In-Time
Lookup (PITL) query in BigQuery. If unset, the project set in
aiplatform.init will be used.
location:
The location to use for feature lookup and running the Point-In-Time
Lookup (PITL) query in BigQuery. If unset, the location set in
aiplatform.init will be used.
credentials:
Custom credentials to use for feature lookup and running the
Point-In-Time Lookup (PITL) query in BigQuery. Overrides credentials
set in aiplatform.init.
Returns:
A `bigframes.pandas.DataFrame` with the historical feature values. `None`
if in `dry_run` mode.
"""
bigframes = _try_import_bigframes()
project = project or initializer.global_config.project
location = location or initializer.global_config.location
credentials = credentials or initializer.global_config.credentials
application_name = (
f"vertexai-offline-store/{__version__}+fetch-historical-feature-values"
)
session_options = bigframes.BigQueryOptions(
credentials=credentials,
project=project,
location=location,
application_name=application_name,
)
session = bigframes.connect(session_options)
if feature_age_threshold is not None:
raise NotImplementedError("feature_age_threshold is not yet supported.")
if not features:
raise ValueError("Please specify a non-empty list of features.")
# Convert to bigframe if needed.
if not isinstance(entity_df, bigframes.pandas.DataFrame):
entity_df = _DataFrameToBigQueryDataFramesConverter.to_bigquery_dataframe(
df=entity_df,
session=session,
)
# Ensure one timestamp column is present in the entity DataFrame.
ts_cols = entity_df.select_dtypes(include=["datetime"]).columns
if len(ts_cols) > 1:
# TODO: Support multiple timestamp columns by specifying feature_timestamp column in an override.
raise ValueError(
'Multiple timestamp columns ("datetime" dtype) found in entity DataFrame. '
"Only one timestamp column is allowed. "
f"Timestamp columns: {', '.join([col for col in ts_cols])}"
)
elif len(ts_cols) == 0:
raise ValueError(
'No timestamp column ("datetime" dtype) found in entity DataFrame.'
)
entity_df_ts_col = ts_cols[0]
entity_df_non_ts_cols = [c for c in entity_df.columns if c != entity_df_ts_col]
entity_data_source = impl.DataSource(
qualifying_name="entity_df",
sql=entity_df.sql,
data_columns=entity_df_non_ts_cols,
timestamp_column=entity_df_ts_col,
)
feature_data: List[impl.DataSource] = []
for feature in features:
if isinstance(feature, Feature):
feature_group = _get_feature_group_from_feature(feature, credentials)
feature_data.append(_feature_to_data_source(feature_group, feature))
elif isinstance(feature, str):
feature_group, feature = _extract_feature_from_str_repr(
feature, credentials
)
feature_data.append(_feature_to_data_source(feature_group, feature))
else:
raise ValueError(
f"Unsupported feature type {type(feature)} found in feature list. Feature: {feature}"
)
# TODO: Verify `feature_data`.
# * Ensure that qualifying_names are not interfering.
# * Ensure that feature names are not interfering.
# * Ensure that entity id columns of all features are present in the entity DF.
query = impl.render_pitl_query(
entity_data=entity_data_source,
feature_data=feature_data,
)
if dry_run:
print("--- Dry run mode: PITL QUERY BEGIN ---")
print(query)
print("--- Dry run mode: PITL QUERY END ---")
return None
return session.read_gbq_query(
query,
index_col=bigframes.enums.DefaultIndexKind.NULL,
)


@@ -0,0 +1,233 @@
# -*- coding: utf-8 -*-
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import abc
from dataclasses import dataclass
from dataclasses import field
import enum
from typing import Any, Dict, List, Optional, Union
from google.cloud.aiplatform.compat.types import (
feature_online_store_service as fos_service,
)
import proto
from typing_extensions import override
def get_feature_online_store_name(online_store_name: str) -> str:
"""Extract Feature Online Store's name from FeatureView's full resource name.
Args:
online_store_name: The FeatureView's full resource name, e.g.
projects/<project_number>/locations/us-central1/featureOnlineStores/<fos_name>/featureViews/<fv_name>
Returns:
str: feature online store name.
"""
arr = online_store_name.split("/")
return arr[5]
class PublicEndpointNotFoundError(RuntimeError):
"""Public endpoint has not been created yet."""
@dataclass
class FeatureViewBigQuerySource:
uri: str
entity_id_columns: List[str]
@dataclass
class FeatureViewVertexRagSource:
uri: str
rag_corpus_id: Optional[str] = None
@dataclass
class FeatureViewRegistrySource:
"""Configuration options for Feature View being registered with Feature Registry features.
Attributes:
features : Use `<feature_group_id>.<feature_id>` as
the format for each feature.
project_number : Optional. The project number of the project that owns the
Feature Registry if in a different project.
"""
features: List[str]
project_number: Optional[int] = None
@dataclass(frozen=True)
class ConnectionOptions:
"""Represents connection options used for sending RPCs to the online store."""
@dataclass(frozen=True)
class InsecureGrpcChannel:
"""Use an insecure gRPC channel to connect to the host."""
pass
host: str # IP address or DNS.
transport: Union[
InsecureGrpcChannel
] # Currently only insecure gRPC channel is supported.
def __eq__(self, other):
if self.host != other.host:
return False
if isinstance(self.transport, ConnectionOptions.InsecureGrpcChannel):
# Insecure grpc channel has no other parameters to check.
if isinstance(other.transport, ConnectionOptions.InsecureGrpcChannel):
return True
# Otherwise, can't compare against a different transport type.
raise ValueError(
f"Transport '{self.transport}' cannot be compared to transport '{other.transport}'."
)
# Currently only InsecureGrpcChannel is supported.
raise ValueError(f"Unsupported transport supplied: {self.transport}")
@dataclass
class FeatureViewReadResponse:
_response: fos_service.FetchFeatureValuesResponse
def __init__(self, response: fos_service.FetchFeatureValuesResponse):
self._response = response
def to_dict(self) -> Dict[str, Any]:
return proto.Message.to_dict(self._response.key_values)
def to_proto(self) -> fos_service.FetchFeatureValuesResponse:
return self._response
@dataclass
class SearchNearestEntitiesResponse:
_response: fos_service.SearchNearestEntitiesResponse
def __init__(self, response: fos_service.SearchNearestEntitiesResponse):
self._response = response
def to_dict(self) -> Dict[str, Any]:
return proto.Message.to_dict(self._response.nearest_neighbors)
def to_proto(self) -> fos_service.SearchNearestEntitiesResponse:
return self._response
class DistanceMeasureType(enum.Enum):
"""The distance measure used in nearest neighbor search."""
DISTANCE_MEASURE_TYPE_UNSPECIFIED = 0
# Euclidean (L_2) Distance.
SQUARED_L2_DISTANCE = 1
# Cosine Distance. Defined as 1 - cosine similarity.
COSINE_DISTANCE = 2
# Dot Product Distance. Defined as a negative of the dot product.
DOT_PRODUCT_DISTANCE = 3
class AlgorithmConfig(abc.ABC):
"""Base class for configuration options for matching algorithm."""
def as_dict(self) -> Dict:
"""Returns the configuration as a dictionary.
Returns:
Dict[str, Any]
"""
pass
@dataclass
class TreeAhConfig(AlgorithmConfig):
"""Configuration options for using the tree-AH algorithm (Shallow tree + Asymmetric Hashing).
Please refer to this paper for more details: https://arxiv.org/abs/1908.10396
Args:
leaf_node_embedding_count (int): Optional. Number of embeddings on each
leaf node. The default value is 1000 if not set.
"""
leaf_node_embedding_count: Optional[int] = None
@override
def as_dict(self) -> Dict:
return {"leaf_node_embedding_count": self.leaf_node_embedding_count}
@dataclass
class BruteForceConfig(AlgorithmConfig):
"""Configuration options for using brute force search.
It simply implements the standard linear search in the database for each
query.
"""
@override
def as_dict(self) -> Dict[str, Any]:
return {"bruteForceConfig": {}}
@dataclass
class IndexConfig:
"""Configuration options for the Vertex FeatureView for embedding."""
embedding_column: str
dimensions: int
algorithm_config: AlgorithmConfig = field(default_factory=TreeAhConfig)
filter_columns: Optional[List[str]] = None
crowding_column: Optional[str] = None
distance_measure_type: Optional[DistanceMeasureType] = None
def as_dict(self) -> Dict[str, Any]:
"""Returns the configuration as a dictionary.
Returns:
Dict[str, Any]
"""
config = {
"embedding_column": self.embedding_column,
"embedding_dimension": self.dimensions,
}
if self.distance_measure_type is not None:
config["distance_measure_type"] = self.distance_measure_type.value
if self.filter_columns is not None:
config["filter_columns"] = self.filter_columns
if self.crowding_column is not None:
config["crowding_column"] = self.crowding_column
if isinstance(self.algorithm_config, TreeAhConfig):
config["tree_ah_config"] = self.algorithm_config.as_dict()
else:
config["brute_force_config"] = self.algorithm_config.as_dict()
return config
@dataclass
class FeatureGroupBigQuerySource:
"""BigQuery source for the Feature Group."""
# The URI for the BigQuery table/view.
uri: str
# The entity ID columns. If not specified, defaults to ['entity_id'].
entity_id_columns: Optional[List[str]] = None


@@ -0,0 +1,26 @@
# -*- coding: utf-8 -*-
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from vertexai.resources.preview.ml_monitoring.model_monitors import (
ModelMonitor,
ModelMonitoringJob,
)
__all__ = (
"ModelMonitor",
"ModelMonitoringJob",
)


@@ -0,0 +1,46 @@
# -*- coding: utf-8 -*-
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from vertexai.resources.preview.ml_monitoring.spec.notification import (
NotificationSpec,
)
from vertexai.resources.preview.ml_monitoring.spec.objective import (
FeatureAttributionSpec,
DataDriftSpec,
MonitoringInput,
ObjectiveSpec,
TabularObjective,
)
from vertexai.resources.preview.ml_monitoring.spec.output import (
OutputSpec,
)
from vertexai.resources.preview.ml_monitoring.spec.schema import (
FieldSchema,
ModelMonitoringSchema,
)
__all__ = (
"NotificationSpec",
"OutputSpec",
"ObjectiveSpec",
"FeatureAttributionSpec",
"DataDriftSpec",
"MonitoringInput",
"TabularObjective",
"FieldSchema",
"ModelMonitoringSchema",
)


@@ -0,0 +1,74 @@
# -*- coding: utf-8 -*-
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Optional, List
from google.cloud.aiplatform.compat.types import (
model_monitoring_spec_v1beta1 as model_monitoring_spec,
)
class NotificationSpec:
"""Initializer for NotificationSpec.
Args:
user_emails (List[str]):
Optional. The email addresses to send the alert to.
notification_channels (List[str]):
Optional. The notification channels to send the alert to.
Format: ``projects/{project}/notificationChannels/{channel}``
enable_cloud_logging (bool):
Optional. Whether to dump the anomalies to Cloud Logging. The anomalies
will be put into the JSON payload, which can be further exported via
log sinks to Pub/Sub or any other services supported by Cloud Logging.
"""
def __init__(
self,
user_emails: Optional[List[str]] = None,
notification_channels: Optional[List[str]] = None,
enable_cloud_logging: Optional[bool] = False,
):
self.user_emails = user_emails
self.notification_channels = notification_channels
self.enable_cloud_logging = enable_cloud_logging
def _as_proto(self) -> model_monitoring_spec.ModelMonitoringNotificationSpec:
"""Converts ModelMonitoringNotificationSpec to a proto message.
Returns:
The GAPIC representation of the notification alert config.
"""
user_email_config = None
if self.user_emails is not None:
user_email_config = (
model_monitoring_spec.ModelMonitoringNotificationSpec.EmailConfig(
user_emails=self.user_emails
)
)
user_notification_channel_config = []
if self.notification_channels:
for notification_channel in self.notification_channels:
user_notification_channel_config.append(
model_monitoring_spec.ModelMonitoringNotificationSpec.NotificationChannelConfig(
notification_channel=notification_channel
)
)
return model_monitoring_spec.ModelMonitoringNotificationSpec(
email_config=user_email_config,
notification_channel_configs=user_notification_channel_config,
enable_cloud_logging=self.enable_cloud_logging,
)


@@ -0,0 +1,522 @@
# -*- coding: utf-8 -*-
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Dict, List, Optional
from google.cloud.aiplatform.compat.types import (
explanation_v1beta1 as explanation,
machine_resources_v1beta1 as machine_resources,
model_monitoring_alert_v1beta1 as model_monitoring_alert,
model_monitoring_spec_v1beta1 as model_monitoring_spec,
)
from google.protobuf import timestamp_pb2
from google.type import interval_pb2
TF_RECORD = "tf-record"
CSV = "csv"
JSONL = "jsonl"
JENSEN_SHANNON_DIVERGENCE = "jensen_shannon_divergence"
L_INFINITY = "l_infinity"
SUPPORTED_NUMERIC_METRICS = [JENSEN_SHANNON_DIVERGENCE]
SUPPORTED_CATEGORICAL_METRICS = [JENSEN_SHANNON_DIVERGENCE, L_INFINITY]
class DataDriftSpec:
"""Data drift monitoring spec.
Data drift measures the distribution distance between the current dataset
and a baseline dataset. A typical use case is to detect data drift between
the recent production serving dataset and the training dataset, or to
compare the recent production dataset with a dataset from a previous period.
Example:
feature_drift_spec=DataDriftSpec(
features=["feature1"]
categorical_metric_type="l_infinity",
numeric_metric_type="jensen_shannon_divergence",
default_categorical_alert_threshold=0.01,
default_numeric_alert_threshold=0.02,
feature_alert_thresholds={"feature1":0.02, "feature2":0.01},
)
Attributes:
features (List[str]):
Optional. Feature names / prediction output names to monitor. These
should be a subset of the input feature names or prediction output
names specified in the monitoring schema.
If not specified, all features / prediction outputs outlined in the
monitoring schema will be used.
categorical_metric_type (str):
Optional. Supported metrics type: l_infinity, jensen_shannon_divergence
numeric_metric_type (str):
Optional. Supported metrics type: jensen_shannon_divergence
default_categorical_alert_threshold (float):
Optional. Default alert threshold for all the categorical features.
default_numeric_alert_threshold (float):
Optional. Default alert threshold for all the numeric features.
feature_alert_thresholds (Dict[str, float]):
Optional. Per feature alert threshold will override default alert
threshold.
"""
def __init__(
self,
features: Optional[List[str]] = None,
categorical_metric_type: Optional[str] = L_INFINITY,
numeric_metric_type: Optional[str] = JENSEN_SHANNON_DIVERGENCE,
default_categorical_alert_threshold: Optional[float] = None,
default_numeric_alert_threshold: Optional[float] = None,
feature_alert_thresholds: Optional[Dict[str, float]] = None,
):
self.features = features
self.categorical_metric_type = categorical_metric_type
self.numeric_metric_type = numeric_metric_type
self.default_categorical_alert_threshold = default_categorical_alert_threshold
self.default_numeric_alert_threshold = default_numeric_alert_threshold
self.feature_alert_thresholds = feature_alert_thresholds
def _as_proto(
self,
) -> model_monitoring_spec.ModelMonitoringObjectiveSpec.DataDriftSpec:
"""Converts DataDriftSpec to a proto message.
Returns:
The GAPIC representation of the data drift spec.
"""
user_default_categorical_alert_threshold = None
user_default_numeric_alert_threshold = None
user_alert_thresholds = None
user_features = None
if self.numeric_metric_type not in SUPPORTED_NUMERIC_METRICS:
raise ValueError(
f"The numeric metric type is not supported"
f" {self.numeric_metric_type}"
)
user_numeric_metric_type = self.numeric_metric_type
if self.categorical_metric_type not in SUPPORTED_CATEGORICAL_METRICS:
raise ValueError(
f"The categorical metric type is not supported"
f" {self.categorical_metric_type}"
)
user_categorical_metric_type = self.categorical_metric_type
if self.default_categorical_alert_threshold:
user_default_categorical_alert_threshold = (
model_monitoring_alert.ModelMonitoringAlertCondition(
threshold=self.default_categorical_alert_threshold
)
)
if self.default_numeric_alert_threshold:
user_default_numeric_alert_threshold = (
model_monitoring_alert.ModelMonitoringAlertCondition(
threshold=self.default_numeric_alert_threshold
)
)
if self.feature_alert_thresholds:
user_alert_thresholds = {}
for feature in self.feature_alert_thresholds:
user_alert_thresholds.update(
{
feature: model_monitoring_alert.ModelMonitoringAlertCondition(
threshold=self.feature_alert_thresholds[feature]
)
}
)
if self.features:
user_features = self.features
return model_monitoring_spec.ModelMonitoringObjectiveSpec.DataDriftSpec(
default_categorical_alert_condition=user_default_categorical_alert_threshold,
default_numeric_alert_condition=user_default_numeric_alert_threshold,
categorical_metric_type=user_categorical_metric_type,
numeric_metric_type=user_numeric_metric_type,
feature_alert_conditions=user_alert_thresholds,
features=user_features,
)
class FeatureAttributionSpec:
"""Feature attribution spec.
Example:
feature_attribution_spec=FeatureAttributionSpec(
features=["feature1"]
default_alert_threshold=0.01,
feature_alert_thresholds={"feature1":0.02, "feature2":0.01},
batch_dedicated_resources=BatchDedicatedResources(
starting_replica_count=1,
max_replica_count=2,
machine_spec=my_machine_spec,
),
)
Attributes:
features (List[str]):
Optional. Input feature names to monitor. These should be a subset
of the input feature names specified in the monitoring schema.
If not specified, all features outlined in the monitoring schema will
be used.
default_alert_threshold (float):
Optional. Default alert threshold for all the features.
feature_alert_thresholds (Dict[str, float]):
Optional. Per feature alert threshold will override default alert
threshold.
batch_dedicated_resources (machine_resources.BatchDedicatedResources):
Optional. The config of resources used by the Model Monitoring during
the batch explanation for non-AutoML models. If not set, `n1-standard-2`
machine type will be used by default.
"""
def __init__(
self,
features: Optional[List[str]] = None,
default_alert_threshold: Optional[float] = None,
feature_alert_thresholds: Optional[Dict[str, float]] = None,
batch_dedicated_resources: Optional[
machine_resources.BatchDedicatedResources
] = None,
):
self.features = features
self.default_alert_threshold = default_alert_threshold
self.feature_alert_thresholds = feature_alert_thresholds
self.batch_dedicated_resources = batch_dedicated_resources
def _as_proto(
self,
) -> model_monitoring_spec.ModelMonitoringObjectiveSpec.FeatureAttributionSpec:
"""Converts FeatureAttributionSpec to a proto message.
Returns:
The GAPIC representation of the feature attribution spec.
"""
user_default_alert_threshold = None
user_alert_thresholds = None
user_features = None
if self.default_alert_threshold:
user_default_alert_threshold = (
model_monitoring_alert.ModelMonitoringAlertCondition(
threshold=self.default_alert_threshold
)
)
if self.feature_alert_thresholds:
user_alert_thresholds = {}
for feature in self.feature_alert_thresholds:
user_alert_thresholds.update(
{
feature: model_monitoring_alert.ModelMonitoringAlertCondition(
threshold=self.feature_alert_thresholds[feature]
)
}
)
if self.features:
user_features = self.features
return (
model_monitoring_spec.ModelMonitoringObjectiveSpec.FeatureAttributionSpec(
default_alert_condition=user_default_alert_threshold,
feature_alert_conditions=user_alert_thresholds,
features=user_features,
batch_explanation_dedicated_resources=self.batch_dedicated_resources,
)
)
class MonitoringInput:
"""Model monitoring data input spec.
Attributes:
vertex_dataset (str):
Optional. Resource name of the Vertex AI managed dataset.
Format: ``projects/{project}/locations/{location}/datasets/{dataset}``
At least one source of dataset should be provided, and if one of the
fields is set, no need to set other sources
(vertex_dataset, gcs_uri, table_uri, query, batch_prediction_job,
endpoints).
gcs_uri (str):
Optional. Google Cloud Storage URI to the input file(s). May contain
wildcards.
data_format (str):
Optional. Data format of Google Cloud Storage file(s). Should be
provided if a gcs_uri is set.
Supported formats:
"csv", "jsonl", "tf-record"
table_uri (str):
Optional. BigQuery URI to a table, up to 2000 characters long.
All the columns in the table will be selected. Accepted forms:
- BigQuery path. For example:
``bq://projectId.bqDatasetId.bqTableId``.
query (str):
Optional. Standard SQL for BigQuery to be used instead of the
``table_uri``.
timestamp_field (str):
Optional. The timestamp field in the dataset.
The ``timestamp_field`` must be specified if you'd like to use
``start_time``, ``end_time``, ``offset`` or ``window``.
If you use ``query`` to specify the dataset, make sure the
``timestamp_field`` is in the selection fields.
batch_prediction_job (str):
Optional. Vertex AI Batch Prediction Job resource name.
Format: ``projects/{project}/locations/{location}/batchPredictionJobs/{batch_prediction_job}``
endpoints (List[str]):
Optional. List of Vertex AI Endpoint resource names.
Format: ``projects/{project}/locations/{location}/endpoints/{endpoint}``
start_time (timestamp_pb2.Timestamp):
Optional. Inclusive start of the time interval for which results
should be returned. Should be set together with ``end_time``.
end_time (timestamp_pb2.Timestamp):
Optional. Exclusive end of the time interval for which results
should be returned. Should be set together with ``start_time``.
offset (str):
Optional. Offset is the time difference from the cut-off time.
For scheduled jobs, the cut-off time is the scheduled time.
For non-scheduled jobs, it's the time when the job was created.
Currently we support the following format:
'w|W': Week, 'd|D': Day, 'h|H': Hour
E.g. '1h' stands for 1 hour, '2d' stands for 2 days.
window (str):
Optional. Window refers to the scope of data selected for analysis.
It allows you to specify the quantity of data you wish to examine.
It refers to the data time window prior to the cut-off time or the
cut-off time minus the offset.
Currently we support the following format:
'w|W': Week, 'd|D': Day, 'h|H': Hour
E.g. '1h' stands for 1 hour, '2d' stands for 2 days.
"""
def __init__(
self,
vertex_dataset: Optional[str] = None,
gcs_uri: Optional[str] = None,
data_format: Optional[str] = None,
table_uri: Optional[str] = None,
query: Optional[str] = None,
timestamp_field: Optional[str] = None,
batch_prediction_job: Optional[str] = None,
endpoints: Optional[List[str]] = None,
start_time: Optional[timestamp_pb2.Timestamp] = None,
end_time: Optional[timestamp_pb2.Timestamp] = None,
offset: Optional[str] = None,
window: Optional[str] = None,
):
self.vertex_dataset = vertex_dataset
self.gcs_uri = gcs_uri
self.data_format = data_format
self.table_uri = table_uri
self.query = query
self.timestamp_field = timestamp_field
self.batch_prediction_job = batch_prediction_job
self.endpoints = endpoints
self.start_time = start_time
self.end_time = end_time
self.offset = offset
self.window = window
def _as_proto(self) -> model_monitoring_spec.ModelMonitoringInput:
"""Converts ModelMonitoringInput to a proto message.
Returns:
The GAPIC representation of the model monitoring input.
"""
user_time_interval = None
user_time_spec = None
if self.offset or self.window:
user_time_spec = model_monitoring_spec.ModelMonitoringInput.TimeOffset(
offset=self.offset if self.offset else None,
window=self.window if self.window else None,
)
elif self.start_time or self.end_time:
user_time_interval = interval_pb2.Interval(
start_time=self.start_time if self.start_time else None,
end_time=self.end_time if self.end_time else None,
)
if self.vertex_dataset or self.gcs_uri or self.table_uri or self.query:
user_vertex_dataset = None
user_gcs_source = None
user_bigquery_source = None
if self.vertex_dataset:
user_vertex_dataset = self.vertex_dataset
elif self.gcs_uri:
if not self.data_format:
raise ValueError("`data_format` must be provided with gcs uri.")
if self.data_format == CSV:
user_data_format = (
model_monitoring_spec.ModelMonitoringInput.ModelMonitoringDataset.ModelMonitoringGcsSource.DataFormat.CSV
)
elif self.data_format == JSONL:
user_data_format = (
model_monitoring_spec.ModelMonitoringInput.ModelMonitoringDataset.ModelMonitoringGcsSource.DataFormat.JSONL
)
elif self.data_format == TF_RECORD:
user_data_format = (
model_monitoring_spec.ModelMonitoringInput.ModelMonitoringDataset.ModelMonitoringGcsSource.DataFormat.TF_RECORD
)
else:
raise ValueError(
(
"Unsupported value in data format. `data_format` "
"must be one of %s, %s, or %s"
)
% (TF_RECORD, CSV, JSONL)
)
user_gcs_source = model_monitoring_spec.ModelMonitoringInput.ModelMonitoringDataset.ModelMonitoringGcsSource(
gcs_uri=self.gcs_uri,
format_=user_data_format,
)
elif self.table_uri or self.query:
user_bigquery_source = model_monitoring_spec.ModelMonitoringInput.ModelMonitoringDataset.ModelMonitoringBigQuerySource(
table_uri=self.table_uri,
query=self.query,
)
else:
raise ValueError(
("At least one source of dataset must" " be provided.")
)
user_model_monitoring_dataset = (
model_monitoring_spec.ModelMonitoringInput.ModelMonitoringDataset(
vertex_dataset=user_vertex_dataset,
gcs_source=user_gcs_source,
bigquery_source=user_bigquery_source,
timestamp_field=self.timestamp_field,
)
)
return model_monitoring_spec.ModelMonitoringInput(
columnized_dataset=user_model_monitoring_dataset,
time_offset=user_time_spec,
time_interval=user_time_interval,
)
elif self.batch_prediction_job:
user_batch_prediction_output = (
model_monitoring_spec.ModelMonitoringInput.BatchPredictionOutput(
batch_prediction_job=self.batch_prediction_job,
)
)
return model_monitoring_spec.ModelMonitoringInput(
batch_prediction_output=user_batch_prediction_output,
time_offset=user_time_spec,
time_interval=user_time_interval,
)
elif self.endpoints:
user_vertex_endpoint_logs = (
model_monitoring_spec.ModelMonitoringInput.VertexEndpointLogs(
endpoints=self.endpoints,
)
)
return model_monitoring_spec.ModelMonitoringInput(
vertex_endpoint_logs=user_vertex_endpoint_logs,
time_offset=user_time_spec,
time_interval=user_time_interval,
)
else:
raise ValueError("At least one source of dataInput must be provided.")
class TabularObjective:
"""Initializer for TabularObjective.
Attributes:
feature_drift_spec (DataDriftSpec):
Optional. Input feature distribution drift monitoring spec.
prediction_output_drift_spec (DataDriftSpec):
Optional. Prediction output distribution drift monitoring spec.
feature_attribution_spec (FeatureAttributionSpec):
Optional. Feature attribution monitoring spec.
"""
def __init__(
self,
feature_drift_spec: Optional[DataDriftSpec] = None,
prediction_output_drift_spec: Optional[DataDriftSpec] = None,
feature_attribution_spec: Optional[FeatureAttributionSpec] = None,
):
self.feature_drift_spec = feature_drift_spec
self.prediction_output_drift_spec = prediction_output_drift_spec
self.feature_attribution_spec = feature_attribution_spec
def _as_proto(
self,
) -> model_monitoring_spec.ModelMonitoringObjectiveSpec.TabularObjective:
"""Converts TabularObjective to a proto message.
Returns:
The GAPIC representation of the model monitoring tabular objective.
"""
user_feature_drift_spec = None
user_prediction_output_drift_spec = None
user_feature_attribution_spec = None
if self.feature_drift_spec:
user_feature_drift_spec = self.feature_drift_spec._as_proto()
if self.prediction_output_drift_spec:
user_prediction_output_drift_spec = (
self.prediction_output_drift_spec._as_proto()
)
if self.feature_attribution_spec:
user_feature_attribution_spec = self.feature_attribution_spec._as_proto()
return model_monitoring_spec.ModelMonitoringObjectiveSpec.TabularObjective(
feature_drift_spec=user_feature_drift_spec,
prediction_output_drift_spec=user_prediction_output_drift_spec,
feature_attribution_spec=user_feature_attribution_spec,
)
class ObjectiveSpec:
"""Initializer for ObjectiveSpec.
Args:
baseline_dataset (MonitoringInput):
Required. Baseline dataset used by all the monitoring objectives.
It could be the training dataset or the production serving dataset
from a previous period.
target_dataset (MonitoringInput):
Required. Target dataset for monitoring analysis; it is used by all
the monitoring objectives.
tabular_objective (TabularObjective):
Optional. The tabular monitoring objective.
explanation_spec (explanation.ExplanationSpec):
Optional. The explanation spec. This spec is required when the
objectives spec includes feature attribution objectives.
"""
def __init__(
self,
baseline_dataset: MonitoringInput,
target_dataset: MonitoringInput,
tabular_objective: Optional[TabularObjective] = None,
explanation_spec: Optional[explanation.ExplanationSpec] = None,
):
self.baseline = baseline_dataset
self.target = target_dataset
self.tabular_objective = tabular_objective
self.explanation_spec = explanation_spec
def _as_proto(self) -> model_monitoring_spec.ModelMonitoringObjectiveSpec:
"""Converts ModelMonitoringObjectiveSpec to a proto message.
Returns:
The GAPIC representation of the model monitoring objective config.
"""
user_tabular_objective = None
if not self.baseline or not self.target:
raise ValueError("At least one objective must be provided.")
if self.tabular_objective:
user_tabular_objective = self.tabular_objective._as_proto()
return model_monitoring_spec.ModelMonitoringObjectiveSpec(
tabular_objective=user_tabular_objective,
explanation_spec=self.explanation_spec if self.explanation_spec else None,
target_dataset=self.target._as_proto(),
baseline_dataset=self.baseline._as_proto(),
)
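# --- Editor's illustrative sketch, not part of the original module ---
# How the pieces above might be combined into an ObjectiveSpec: a baseline
# dataset (e.g. training data on GCS) and a target dataset (e.g. a batch
# prediction job's output). All resource names and paths are placeholders.
def _example_objective_spec():  # pragma: no cover - illustrative only
    baseline = MonitoringInput(
        gcs_uri="gs://my-bucket/training/*.csv",
        data_format="csv",
    )
    target = MonitoringInput(
        batch_prediction_job=(
            "projects/my-project/locations/us-central1/batchPredictionJobs/456"
        ),
    )
    return ObjectiveSpec(
        baseline_dataset=baseline,
        target_dataset=target,
        # A TabularObjective carrying drift / attribution specs would normally
        # be supplied here as well.
    )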

View File

@@ -0,0 +1,48 @@
# -*- coding: utf-8 -*-
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from google.cloud.aiplatform.compat.types import (
io_v1beta1 as io,
model_monitoring_spec_v1beta1 as model_monitoring_spec,
)
class OutputSpec:
"""Initializer for OutputSpec.
Args:
gcs_base_dir (str):
Required. Google Cloud Storage base folder path for metrics, error
logs, etc.
"""
def __init__(
self,
gcs_base_dir: str,
):
self.gcs_base_dir = gcs_base_dir
def _as_proto(self) -> model_monitoring_spec.ModelMonitoringOutputSpec:
"""Converts ModelMonitoringOutputSpec to a proto message.
Returns:
The GAPIC representation of the model monitoring output spec.
"""
user_gcs_base_dir = io.GcsDestination(output_uri_prefix=self.gcs_base_dir)
return model_monitoring_spec.ModelMonitoringOutputSpec(
gcs_base_directory=user_gcs_base_dir,
)
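# --- Editor's illustrative sketch, not part of the original module ---
# OutputSpec wraps the GCS folder that monitoring metrics and error logs are
# written under; the bucket path below is a placeholder.
def _example_output_spec():  # pragma: no cover - illustrative only
    return OutputSpec(gcs_base_dir="gs://my-bucket/model-monitoring-output")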

View File

@@ -0,0 +1,441 @@
# -*- coding: utf-8 -*-
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import json
import logging
import os
from typing import Dict, List, MutableSequence, Optional
from google.cloud import bigquery
from google.cloud.aiplatform.compat.types import (
model_monitor_v1beta1 as model_monitor,
)
try:
import pandas as pd
except ImportError:
pd = None
try:
import tensorflow as tf
except ImportError:
tf = None
class FieldSchema:
"""Field Schema.
The class identifies the data type of a single field; field schemas
combine to form the schema for the different fields in
ModelMonitoringSchema.
Attributes:
name (str):
Required. Field name.
data_type (str):
Required. Supported data types are: ``float``, ``integer``,
``boolean``, ``string``, ``categorical``.
repeated (bool):
Optional. Describes whether the schema field is an array of the
given data type.
"""
def __init__(
self,
name: str,
data_type: str,
repeated: Optional[bool] = False,
):
self.name = name
self.data_type = data_type
self.repeated = repeated
def _as_proto(self) -> model_monitor.ModelMonitoringSchema.FieldSchema:
"""Converts ModelMonitoringSchema.FieldSchema to a proto message.
Returns:
The GAPIC representation of the model monitoring field schema.
"""
return model_monitor.ModelMonitoringSchema.FieldSchema(
name=self.name,
data_type=self.data_type,
repeated=self.repeated,
)
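# --- Editor's illustrative sketch, not part of the original module ---
# FieldSchema describes one column of the dataset, e.g. a scalar float feature
# and a repeated (array-valued) string feature; the field names are made up.
def _example_field_schemas():  # pragma: no cover - illustrative only
    return [
        FieldSchema(name="age", data_type="float"),
        FieldSchema(name="tags", data_type="string", repeated=True),
    ]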
class ModelMonitoringSchema:
"""Initializer for ModelMonitoringSchema.
Args:
feature_fields (MutableSequence[FieldSchema]):
Required. Feature names of the model. Vertex AI will try to match
the features from your dataset as follows:
* For 'csv' files, the header names are required, and we will
extract the corresponding feature values when the header names
align with the feature names.
* For 'jsonl' files, we will extract the corresponding feature
values if the key names match the feature names. Note: Nested
features are not supported, so please ensure your features are
flattened. Ensure the feature values are scalar or an array of
scalars.
* For 'bigquery' dataset, we will extract the corresponding feature
values if the column names match the feature names.
Note: The column type can be a scalar or an array of scalars.
STRUCT or JSON types are not supported. You may use SQL queries to
select or aggregate the relevant features from your original
table. However, ensure that the 'schema' of the query results
meets our requirements.
* For the Vertex AI Endpoint Request-Response Logging table or
Vertex AI Batch Prediction Job results: if the prediction
instance format is an array, ensure that the sequence in
``feature_fields`` matches the order of features in the prediction
instance. We will match the feature with the array in the order
specified in ``feature_fields``.
prediction_fields (MutableSequence[FieldSchema]):
Optional. Prediction output names of the model. The requirements are
the same as the ``feature_fields``.
For AutoML Tables, the prediction output name presented in schema
will be: `predicted_{target_column}`, where `target_column` is the one
you specified when training the model.
For Prediction output drift analysis:
* AutoML Classification, the distribution of the argmax label will
be analyzed.
* AutoML Regression, the distribution of the value will be analyzed.
ground_truth_fields (MutableSequence[FieldSchema]):
Optional. Target/ground truth names of the model.
"""
def __init__(
self,
feature_fields: MutableSequence[FieldSchema],
ground_truth_fields: Optional[MutableSequence[FieldSchema]] = None,
prediction_fields: Optional[MutableSequence[FieldSchema]] = None,
):
self.feature_fields = feature_fields
self.prediction_fields = prediction_fields
self.ground_truth_fields = ground_truth_fields
def _as_proto(self) -> model_monitor.ModelMonitoringSchema:
"""Converts ModelMonitoringSchema to a proto message.
Returns:
The GAPIC representation of the model monitoring schema.
"""
user_feature_fields = list()
user_prediction_fields = list()
user_ground_truth_fields = list()
for field in self.feature_fields:
user_feature_fields.append(field._as_proto())
if self.prediction_fields:
for field in self.prediction_fields:
user_prediction_fields.append(field._as_proto())
if self.ground_truth_fields:
for field in self.ground_truth_fields:
user_ground_truth_fields.append(field._as_proto())
return model_monitor.ModelMonitoringSchema(
feature_fields=user_feature_fields,
prediction_fields=user_prediction_fields
if self.prediction_fields
else None,
ground_truth_fields=user_ground_truth_fields
if self.ground_truth_fields
else None,
)
def to_json(self, output_dir: Optional[str] = None) -> str:
"""Transform ModelMonitoringSchema to json format.
Args:
output_dir (str):
Optional. The output directory that the transformed json file
would be put into.
"""
result = model_monitor.ModelMonitoringSchema.to_json(self._as_proto())
if output_dir:
result_path = os.path.join(output_dir, "model_monitoring_schema.json")
with tf.io.gfile.GFile(result_path, "w") as f:
json.dump(result, f)
logging.info("Transformed schema to json file: %s", result_path)
return result
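# --- Editor's illustrative sketch, not part of the original module ---
# Assembling a ModelMonitoringSchema by hand with hypothetical field names;
# the transform_schema_from_* helpers below derive the same structure
# automatically from an existing dataset.
def _example_monitoring_schema():  # pragma: no cover - illustrative only
    return ModelMonitoringSchema(
        feature_fields=[
            FieldSchema(name="age", data_type="float"),
            FieldSchema(name="country", data_type="categorical"),
        ],
        prediction_fields=[
            FieldSchema(name="predicted_churn", data_type="float")
        ],
        ground_truth_fields=[FieldSchema(name="churn", data_type="boolean")],
    )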
def _check_duplicate(
field: str,
feature_fields: Optional[List[str]] = None,
ground_truth_fields: Optional[List[str]] = None,
prediction_fields: Optional[List[str]] = None,
) -> bool:
"""Check if a field appears in two field lists."""
feature = True
ground_truth = True
prediction = True
if not feature_fields or field not in feature_fields:
feature = False
if not ground_truth_fields or field not in ground_truth_fields:
ground_truth = False
if not prediction_fields or field not in prediction_fields:
prediction = False
return feature if (feature == ground_truth) else prediction
def _transform_schema_pandas(
dataset: Dict[str, str],
feature_fields: Optional[List[str]] = None,
ground_truth_fields: Optional[List[str]] = None,
prediction_fields: Optional[List[str]] = None,
) -> ModelMonitoringSchema:
"""Transforms the pandas schema to model monitoring schema."""
ground_truth_fields_list = list()
prediction_fields_list = list()
feature_fields_list = list()
pandas_integer_types = ["integer", "Int32", "Int64", "UInt32", "UInt64"]
pandas_string_types = [
"string",
"bytes",
"date",
"time",
"datetime64",
"datetime",
"mixed-integer",
"inteval",
"Interval",
]
pandas_float_types = [
"floating",
"decimal",
"mixed-integer-float",
"Float32",
"Float64",
]
for field in dataset:
infer_type = dataset[field]
if infer_type in pandas_string_types:
data_type = "string"
elif infer_type in pandas_integer_types:
data_type = "integer"
elif infer_type in pandas_float_types:
data_type = "float"
elif infer_type == "boolean":
data_type = "boolean"
elif infer_type == "categorical" or infer_type == "category":
data_type = "categorical"
else:
raise ValueError(f"Unsupported data type: {infer_type}")
if _check_duplicate(
field, feature_fields, ground_truth_fields, prediction_fields
):
raise ValueError(f"The field {field} specified in two or more field lists")
if ground_truth_fields and field in ground_truth_fields:
ground_truth_fields_list.append(
FieldSchema(
name=field,
data_type=data_type,
)
)
elif prediction_fields and field in prediction_fields:
prediction_fields_list.append(
FieldSchema(
name=field,
data_type=data_type,
)
)
elif (feature_fields and field in feature_fields) or not feature_fields:
feature_fields_list.append(
FieldSchema(
name=field,
data_type=data_type,
)
)
return ModelMonitoringSchema(
ground_truth_fields=ground_truth_fields_list if ground_truth_fields else None,
prediction_fields=prediction_fields_list if prediction_fields else None,
feature_fields=feature_fields_list,
)
def transform_schema_from_bigquery(
feature_fields: Optional[List[str]] = None,
ground_truth_fields: Optional[List[str]] = None,
prediction_fields: Optional[List[str]] = None,
table: Optional[str] = None,
query: Optional[str] = None,
) -> ModelMonitoringSchema:
"""Transform the existing dataset to ModelMonitoringSchema as model monitor
could accept.
Args:
feature_fields (List[str]):
Optional. The input feature fields for given dataset.
By default all features we find would be the input features.
ground_truth_fields (List[str]):
Optional. The ground truth fields for given dataset.
By default all features we find would be the input features.
prediction_fields (List[str]):
Optional. The prediction output field for given dataset.
By default all features we find would be the input features.
table (str):
Optional. The BigQuery table uri.
query (str):
Optional. The BigQuery query.
"""
ground_truth_fields_list = list()
prediction_fields_list = list()
feature_fields_list = list()
bq_string_types = [
"STRING",
"BYTES",
"DATE",
"TIME",
"GEOGRAPHY",
"DATETIME",
"JSON",
"INTEVAL",
"RANGE",
]
bq_integer_types = ["INTEGER", "INT64", "TIMESTAMP"]
bq_float_types = ["FLOAT", "DOUBLE", "FLOAT64", "NUMERIC", "BIGNUMERIC"]
if table:
if table.startswith("bq://"):
table = table[len("bq://") :]
try:
client = bigquery.Client()
table = client.get_table(table)
bq_schema = table.schema
except Exception as e:
raise ValueError("Failed to get table from bq address provided.") from e
elif query:
try:
client = bigquery.Client()
bq_schema = client.query(
query=query, job_config=bigquery.job.QueryJobConfig(dry_run=True)
).schema
except Exception as e:
raise ValueError("Failed to get query from bq address provided.") from e
else:
raise ValueError("Either table or query must be provided.")
for field in bq_schema:
if field.field_type in bq_string_types:
data_type = "string"
elif field.field_type in bq_integer_types:
data_type = "integer"
elif field.field_type in bq_float_types:
data_type = "float"
elif field.field_type == "BOOLEAN" or field.field_type == "BOOL":
data_type = "boolean"
else:
raise ValueError(f"Unsupported data type: {field.field_type}")
if _check_duplicate(
field.name, feature_fields, ground_truth_fields, prediction_fields
):
raise ValueError(
f"The field {field.name} specified in two or more field lists"
)
if ground_truth_fields and field.name in ground_truth_fields:
ground_truth_fields_list.append(
FieldSchema(
name=field.name,
data_type=data_type,
repeated=True if field.mode == "REPEATED" else False,
)
)
elif prediction_fields and field.name in prediction_fields:
prediction_fields_list.append(
FieldSchema(
name=field.name,
data_type=data_type,
repeated=True if field.mode == "REPEATED" else False,
)
)
elif (feature_fields and field.name in feature_fields) or not feature_fields:
feature_fields_list.append(
FieldSchema(
name=field.name,
data_type=data_type,
repeated=True if field.mode == "REPEATED" else False,
)
)
return ModelMonitoringSchema(
ground_truth_fields=ground_truth_fields_list if ground_truth_fields else None,
prediction_fields=prediction_fields_list if prediction_fields else None,
feature_fields=feature_fields_list,
)
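# --- Editor's illustrative sketch, not part of the original module ---
# Deriving a schema from a BigQuery table; calling this would contact BigQuery
# with application-default credentials, and the table URI is a placeholder.
def _example_schema_from_bigquery():  # pragma: no cover - illustrative only
    return transform_schema_from_bigquery(
        table="bq://my-project.my_dataset.training_data",
        ground_truth_fields=["churn"],
        prediction_fields=["predicted_churn"],
    )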
def transform_schema_from_csv(
file_path: str,
feature_fields: Optional[List[str]] = None,
ground_truth_fields: Optional[List[str]] = None,
prediction_fields: Optional[List[str]] = None,
) -> ModelMonitoringSchema:
"""Transform the existing dataset to ModelMonitoringSchema as model monitor could accept.
Args:
file_path (str):
Required. The dataset file path.
feature_fields (List[str]):
Optional. The input feature fields for given dataset.
By default all features we find would be the input features.
ground_truth_fields (List[str]):
Optional. The ground truth fields for given dataset.
By default all features we find would be the input features.
prediction_fields (List[str]):s
Optional. The prediction output field for given dataset.
By default all features we find would be the input features.
"""
with tf.io.gfile.GFile(file_path, "r") as f:
input_dataset = pd.read_csv(f)
dict_dataset = dict()
for field in input_dataset.columns:
dict_dataset[field] = input_dataset.convert_dtypes().dtypes[field]
monitoring_schema = _transform_schema_pandas(
dict_dataset, feature_fields, ground_truth_fields, prediction_fields
)
return monitoring_schema
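# --- Editor's illustrative sketch, not part of the original module ---
# Deriving a schema from a CSV file; this path requires tensorflow (for
# tf.io.gfile) and pandas to be installed, and the file path is a placeholder.
def _example_schema_from_csv():  # pragma: no cover - illustrative only
    return transform_schema_from_csv(
        file_path="gs://my-bucket/training_data.csv",
        ground_truth_fields=["churn"],
    )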
def transform_schema_from_json(
file_path: str,
feature_fields: Optional[List[str]] = None,
ground_truth_fields: Optional[List[str]] = None,
prediction_fields: Optional[List[str]] = None,
) -> ModelMonitoringSchema:
"""Transform the existing dataset to ModelMonitoringSchema as model monitor
could accept.
Args:
file_path (str):
Required. The dataset file path.
feature_fields (List[str]):
Optional. The input feature fields for given dataset.
By default all features we find would be the input features.
ground_truth_fields (List[str]):
Optional. The ground truth fields for given dataset.
By default all features we find would be the input features.
prediction_fields (List[str]):
Optional. The prediction output field for given dataset.
By default all features we find would be the input features.
"""
with tf.io.gfile.GFile(file_path, "r") as f:
input_dataset = pd.read_json(f, lines=True)
dict_dataset = dict()
for field in input_dataset.columns:
dict_dataset[field] = input_dataset.convert_dtypes().dtypes[field]
monitoring_schema = _transform_schema_pandas(
dict_dataset, feature_fields, ground_truth_fields, prediction_fields
)
return monitoring_schema