structure saas with tools
.venv/lib/python3.10/site-packages/vertex_ray/data.py (new file, +192 lines)
@@ -0,0 +1,192 @@
# -*- coding: utf-8 -*-

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import warnings
import ray.data
from ray.data.dataset import Dataset
from typing import Any, Dict, Optional

from google.cloud.aiplatform.vertex_ray.bigquery_datasource import (
    _BigQueryDatasource,
)

try:
    from google.cloud.aiplatform.vertex_ray.bigquery_datasink import (
        _BigQueryDatasink,
    )
except ImportError:
    _BigQueryDatasink = None

from google.cloud.aiplatform.vertex_ray.util._validation_utils import (
    _V2_4_WARNING_MESSAGE,
    _V2_9_WARNING_MESSAGE,
)


def read_bigquery(
    project_id: Optional[str] = None,
    dataset: Optional[str] = None,
    query: Optional[str] = None,
    *,
    parallelism: int = -1,
    ray_remote_args: Optional[Dict[str, Any]] = None,
    concurrency: Optional[int] = None,
    override_num_blocks: Optional[int] = None,
) -> Dataset:
"""Create a dataset from BigQuery.
|
||||
|
||||
The data to read from is specified via the ``project_id``, ``dataset``
|
||||
and/or ``query`` parameters.
|
||||
|
||||
Args:
|
||||
project_id: The name of the associated Google Cloud Project that hosts
|
||||
the dataset to read.
|
||||
dataset: The name of the dataset hosted in BigQuery in the format of
|
||||
``dataset_id.table_id``. Both the dataset_id and table_id must exist
|
||||
otherwise an exception will be raised.
|
||||
query: The query to execute. The dataset is created from the results of
|
||||
executing the query if provided. Otherwise, the entire dataset is read.
|
||||
For query syntax guidelines, see
|
||||
https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax
|
||||
parallelism: 2.33.0, 2.42.0: This argument is deprecated. Use
|
||||
``override_num_blocks`` argument. 2.9.3: The requested parallelism of
|
||||
the read. If -1, it will be automatically chosen based on the available
|
||||
cluster resources and estimated in-memory data size.
|
||||
ray_remote_args: kwargs passed to ray.remote in the read tasks.
|
||||
concurrency: Supported for 2.33.0 and 2.42.0 only: The maximum number of
|
||||
Ray tasks to run concurrently. Set this to control number of tasks to
|
||||
run concurrently. This doesn't change the total number of tasks run or
|
||||
the total number of output blocks. By default, concurrency is
|
||||
dynamically decided based on the available resources.
|
||||
override_num_blocks: Supported for 2.33.0 and 2.42.0 only: Override the
|
||||
number of output blocks from all read tasks. By default, the number of
|
||||
output blocks is dynamically decided based on input data size and
|
||||
available resources. You shouldn't manually set this value in most
|
||||
cases.
|
||||
|
||||
Returns:
|
||||
Dataset producing rows from the results of executing the query
|
||||
or reading the entire dataset on the specified BigQuery dataset.
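
    Example (an illustrative sketch, not from the original module; the
    project and table names are placeholders, and the import path assumes
    this module is importable as ``vertex_ray.data``):

    .. code-block:: python

        from vertex_ray import data

        # Read an entire existing table.
        ds = data.read_bigquery(
            project_id="my-project",          # placeholder project
            dataset="my_dataset.my_table",    # placeholder dataset_id.table_id
        )

        # Or read the result of a query instead of a full table.
        ds = data.read_bigquery(
            project_id="my-project",
            query="SELECT * FROM `my-project.my_dataset.my_table` LIMIT 100",
        )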
    """
    datasource = _BigQueryDatasource(
        project_id=project_id,
        dataset=dataset,
        query=query,
    )

    if ray.__version__ == "2.9.3":
        warnings.warn(_V2_9_WARNING_MESSAGE, DeprecationWarning, stacklevel=1)
        # Concurrency and override_num_blocks are not supported in 2.9.3.
        return ray.data.read_datasource(
            datasource=datasource,
            parallelism=parallelism,
            ray_remote_args=ray_remote_args,
        )
    elif ray.__version__ in ("2.33.0", "2.42.0"):
        return ray.data.read_datasource(
            datasource=datasource,
            parallelism=parallelism,
            ray_remote_args=ray_remote_args,
            concurrency=concurrency,
            override_num_blocks=override_num_blocks,
        )
    else:
        raise ImportError(
            f"[Ray on Vertex AI]: Unsupported version {ray.__version__}. "
            "Only 2.42.0, 2.33.0, and 2.9.3 are supported."
        )


def write_bigquery(
    ds: Dataset,
    project_id: Optional[str] = None,
    dataset: Optional[str] = None,
    max_retry_cnt: int = 10,
    ray_remote_args: Optional[Dict[str, Any]] = None,
    overwrite_table: Optional[bool] = True,
    concurrency: Optional[int] = None,
) -> Any:
    """Write the dataset to a BigQuery dataset table.

    Args:
        ds: The dataset to write.
        project_id: The name of the associated Google Cloud Project that hosts
            the dataset table to write to.
        dataset: The name of the dataset table hosted in BigQuery in the
            format of ``dataset_id.table_id``. The dataset table is created if
            it doesn't already exist. In 2.9.3, the table_id is overwritten if
            it exists.
        max_retry_cnt: The maximum number of times an individual block write
            is retried due to BigQuery rate-limiting errors. The default
            number of retries is 10.
        ray_remote_args: kwargs passed to ``ray.remote`` in the write tasks.
        overwrite_table: Not supported in 2.9.3. 2.33.0 and 2.42.0: Whether
            the write will overwrite the table if it already exists. The
            default behavior is to overwrite the table. If false, the write
            will append to the table if it exists.
        concurrency: Not supported in 2.9.3. 2.33.0 and 2.42.0: The maximum
            number of Ray tasks to run concurrently. Set this to control the
            number of tasks run concurrently. This doesn't change the total
            number of tasks run or the total number of output blocks. By
            default, concurrency is decided dynamically based on the available
            resources.
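
    Example (an illustrative sketch, not from the original module; names are
    placeholders and the import path assumes ``vertex_ray.data``):

    .. code-block:: python

        from vertex_ray import data

        ds = data.read_bigquery(
            project_id="my-project",
            dataset="my_dataset.source_table",
        )
        data.write_bigquery(
            ds,
            project_id="my-project",
            dataset="my_dataset.destination_table",
            overwrite_table=False,  # append instead of overwriting
        )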
    """
if ray.__version__ == "2.4.0":
        raise RuntimeError(_V2_4_WARNING_MESSAGE)

    elif ray.__version__ in ("2.9.3", "2.33.0", "2.42.0"):
        if ray.__version__ == "2.9.3":
            warnings.warn(_V2_9_WARNING_MESSAGE, DeprecationWarning, stacklevel=1)
        if ray_remote_args is None:
            ray_remote_args = {}

        # Each write task will launch individual remote tasks to write each
        # block. To avoid duplicate block writes, the write task should not
        # be retried.
        if ray_remote_args.get("max_retries", 0) != 0:
            print(
                "[Ray on Vertex AI]: The max_retries of a BigQuery Write "
                "Task should be set to 0 to avoid duplicate writes."
            )
        else:
            ray_remote_args["max_retries"] = 0

        if ray.__version__ == "2.9.3":
            # Concurrency and overwrite_table are not supported in 2.9.3.
            datasink = _BigQueryDatasink(
                project_id=project_id,
                dataset=dataset,
                max_retry_cnt=max_retry_cnt,
            )
            return ds.write_datasink(
                datasink=datasink,
                ray_remote_args=ray_remote_args,
            )
        elif ray.__version__ in ("2.33.0", "2.42.0"):
            datasink = _BigQueryDatasink(
                project_id=project_id,
                dataset=dataset,
                max_retry_cnt=max_retry_cnt,
                overwrite_table=overwrite_table,
            )
            return ds.write_datasink(
                datasink=datasink,
                ray_remote_args=ray_remote_args,
                concurrency=concurrency,
            )
    else:
        raise ImportError(
            f"[Ray on Vertex AI]: Unsupported version {ray.__version__}. "
            "Only 2.42.0, 2.33.0, and 2.9.3 are supported."
        )