structure saas with tools

Davidson Gomes
2025-04-25 15:30:54 -03:00
commit 1aef473937
16434 changed files with 6584257 additions and 0 deletions


@@ -0,0 +1,249 @@
# Copyright 2015 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Google BigQuery API wrapper.
The main concepts with this API are:
- :class:`~google.cloud.bigquery.client.Client` manages connections to the
BigQuery API. Use the client methods to run jobs (such as a
:class:`~google.cloud.bigquery.job.QueryJob` via
:meth:`~google.cloud.bigquery.client.Client.query`) and manage resources.
- :class:`~google.cloud.bigquery.dataset.Dataset` represents a
collection of tables.
- :class:`~google.cloud.bigquery.table.Table` represents a single "relation".
"""
import warnings
from google.cloud.bigquery import version as bigquery_version
__version__ = bigquery_version.__version__
from google.cloud.bigquery.client import Client
from google.cloud.bigquery.dataset import AccessEntry
from google.cloud.bigquery.dataset import Dataset
from google.cloud.bigquery.dataset import DatasetReference
from google.cloud.bigquery import enums
from google.cloud.bigquery.enums import AutoRowIDs
from google.cloud.bigquery.enums import DecimalTargetType
from google.cloud.bigquery.enums import KeyResultStatementKind
from google.cloud.bigquery.enums import SqlTypeNames
from google.cloud.bigquery.enums import StandardSqlTypeNames
from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError
from google.cloud.bigquery.exceptions import LegacyPandasError
from google.cloud.bigquery.exceptions import LegacyPyarrowError
from google.cloud.bigquery.external_config import ExternalConfig
from google.cloud.bigquery.external_config import BigtableOptions
from google.cloud.bigquery.external_config import BigtableColumnFamily
from google.cloud.bigquery.external_config import BigtableColumn
from google.cloud.bigquery.external_config import CSVOptions
from google.cloud.bigquery.external_config import GoogleSheetsOptions
from google.cloud.bigquery.external_config import ExternalSourceFormat
from google.cloud.bigquery.external_config import HivePartitioningOptions
from google.cloud.bigquery.format_options import AvroOptions
from google.cloud.bigquery.format_options import ParquetOptions
from google.cloud.bigquery.job.base import SessionInfo
from google.cloud.bigquery.job import Compression
from google.cloud.bigquery.job import CopyJob
from google.cloud.bigquery.job import CopyJobConfig
from google.cloud.bigquery.job import CreateDisposition
from google.cloud.bigquery.job import DestinationFormat
from google.cloud.bigquery.job import DmlStats
from google.cloud.bigquery.job import Encoding
from google.cloud.bigquery.job import ExtractJob
from google.cloud.bigquery.job import ExtractJobConfig
from google.cloud.bigquery.job import LoadJob
from google.cloud.bigquery.job import LoadJobConfig
from google.cloud.bigquery.job import OperationType
from google.cloud.bigquery.job import QueryJob
from google.cloud.bigquery.job import QueryJobConfig
from google.cloud.bigquery.job import QueryPriority
from google.cloud.bigquery.job import SchemaUpdateOption
from google.cloud.bigquery.job import ScriptOptions
from google.cloud.bigquery.job import SourceFormat
from google.cloud.bigquery.job import UnknownJob
from google.cloud.bigquery.job import TransactionInfo
from google.cloud.bigquery.job import WriteDisposition
from google.cloud.bigquery.model import Model
from google.cloud.bigquery.model import ModelReference
from google.cloud.bigquery.query import ArrayQueryParameter
from google.cloud.bigquery.query import ArrayQueryParameterType
from google.cloud.bigquery.query import ConnectionProperty
from google.cloud.bigquery.query import ScalarQueryParameter
from google.cloud.bigquery.query import ScalarQueryParameterType
from google.cloud.bigquery.query import RangeQueryParameter
from google.cloud.bigquery.query import RangeQueryParameterType
from google.cloud.bigquery.query import SqlParameterScalarTypes
from google.cloud.bigquery.query import StructQueryParameter
from google.cloud.bigquery.query import StructQueryParameterType
from google.cloud.bigquery.query import UDFResource
from google.cloud.bigquery.retry import DEFAULT_RETRY
from google.cloud.bigquery.routine import DeterminismLevel
from google.cloud.bigquery.routine import Routine
from google.cloud.bigquery.routine import RoutineArgument
from google.cloud.bigquery.routine import RoutineReference
from google.cloud.bigquery.routine import RoutineType
from google.cloud.bigquery.routine import RemoteFunctionOptions
from google.cloud.bigquery.schema import PolicyTagList
from google.cloud.bigquery.schema import SchemaField
from google.cloud.bigquery.schema import FieldElementType
from google.cloud.bigquery.standard_sql import StandardSqlDataType
from google.cloud.bigquery.standard_sql import StandardSqlField
from google.cloud.bigquery.standard_sql import StandardSqlStructType
from google.cloud.bigquery.standard_sql import StandardSqlTableType
from google.cloud.bigquery.table import PartitionRange
from google.cloud.bigquery.table import RangePartitioning
from google.cloud.bigquery.table import Row
from google.cloud.bigquery.table import SnapshotDefinition
from google.cloud.bigquery.table import CloneDefinition
from google.cloud.bigquery.table import Table
from google.cloud.bigquery.table import TableReference
from google.cloud.bigquery.table import TimePartitioningType
from google.cloud.bigquery.table import TimePartitioning
from google.cloud.bigquery.encryption_configuration import EncryptionConfiguration
from google.cloud.bigquery import _versions_helpers
try:
import bigquery_magics # type: ignore
except ImportError:
bigquery_magics = None
sys_major, sys_minor, sys_micro = _versions_helpers.extract_runtime_version()
if sys_major == 3 and sys_minor in (7, 8):
warnings.warn(
"The python-bigquery library no longer supports Python 3.7 "
"and Python 3.8. "
f"Your Python version is {sys_major}.{sys_minor}.{sys_micro}. We "
"recommend that you update soon to ensure ongoing support. For "
"more details, see: [Google Cloud Client Libraries Supported Python Versions policy](https://cloud.google.com/python/docs/supported-python-versions)",
FutureWarning,
)
__all__ = [
"__version__",
"Client",
# Queries
"ConnectionProperty",
"QueryJob",
"QueryJobConfig",
"ArrayQueryParameter",
"ScalarQueryParameter",
"StructQueryParameter",
"RangeQueryParameter",
"ArrayQueryParameterType",
"ScalarQueryParameterType",
"SqlParameterScalarTypes",
"StructQueryParameterType",
"RangeQueryParameterType",
# Datasets
"Dataset",
"DatasetReference",
"AccessEntry",
# Tables
"Table",
"TableReference",
"PartitionRange",
"RangePartitioning",
"Row",
"SnapshotDefinition",
"CloneDefinition",
"TimePartitioning",
"TimePartitioningType",
# Jobs
"CopyJob",
"CopyJobConfig",
"ExtractJob",
"ExtractJobConfig",
"LoadJob",
"LoadJobConfig",
"SessionInfo",
"UnknownJob",
# Models
"Model",
"ModelReference",
# Routines
"Routine",
"RoutineArgument",
"RoutineReference",
"RemoteFunctionOptions",
# Shared helpers
"SchemaField",
"FieldElementType",
"PolicyTagList",
"UDFResource",
"ExternalConfig",
"AvroOptions",
"BigtableOptions",
"BigtableColumnFamily",
"BigtableColumn",
"DmlStats",
"CSVOptions",
"GoogleSheetsOptions",
"HivePartitioningOptions",
"ParquetOptions",
"ScriptOptions",
"TransactionInfo",
"DEFAULT_RETRY",
# Standard SQL types
"StandardSqlDataType",
"StandardSqlField",
"StandardSqlStructType",
"StandardSqlTableType",
# Enum Constants
"enums",
"AutoRowIDs",
"Compression",
"CreateDisposition",
"DecimalTargetType",
"DestinationFormat",
"DeterminismLevel",
"ExternalSourceFormat",
"Encoding",
"KeyResultStatementKind",
"OperationType",
"QueryPriority",
"RoutineType",
"SchemaUpdateOption",
"SourceFormat",
"SqlTypeNames",
"StandardSqlTypeNames",
"WriteDisposition",
# EncryptionConfiguration
"EncryptionConfiguration",
# Custom exceptions
"LegacyBigQueryStorageError",
"LegacyPyarrowError",
"LegacyPandasError",
]
def load_ipython_extension(ipython):
"""Called by IPython when this module is loaded as an IPython extension."""
warnings.warn(
"%load_ext google.cloud.bigquery is deprecated. Install bigquery-magics package and use `%load_ext bigquery_magics`, instead.",
category=FutureWarning,
)
if bigquery_magics is not None:
bigquery_magics.load_ipython_extension(ipython)
else:
from google.cloud.bigquery.magics.magics import _cell_magic
ipython.register_magic_function(
_cell_magic, magic_kind="cell", magic_name="bigquery"
)
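
A minimal usage sketch of the concepts described in the module docstring above (Client, Dataset, Table). It assumes application default credentials; the project, dataset, and table names are placeholders:

from google.cloud import bigquery

client = bigquery.Client(project="my-project")  # placeholder project ID

# Run a query job (Client.query returns a QueryJob) and iterate the rows.
job = client.query("SELECT 1 AS x")
for row in job.result():
    print(row.x)

# Datasets and tables are addressed through reference objects.
dataset_ref = bigquery.DatasetReference("my-project", "my_dataset")
table_ref = dataset_ref.table("my_table")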

File diff suppressed because it is too large.


@@ -0,0 +1,47 @@
# Copyright 2015 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Create / interact with Google BigQuery connections."""
from google.cloud import _http # type: ignore # pytype: disable=import-error
from google.cloud.bigquery import __version__
class Connection(_http.JSONConnection):
"""A connection to Google BigQuery via the JSON REST API.
Args:
client (google.cloud.bigquery.client.Client): The client that owns the current connection.
client_info (Optional[google.api_core.client_info.ClientInfo]): Instance used to generate user agent.
api_endpoint (str): The api_endpoint to use. If None, the library will decide what endpoint to use.
"""
DEFAULT_API_ENDPOINT = "https://bigquery.googleapis.com"
DEFAULT_API_MTLS_ENDPOINT = "https://bigquery.mtls.googleapis.com"
def __init__(self, client, client_info=None, api_endpoint=None):
super(Connection, self).__init__(client, client_info)
self.API_BASE_URL = api_endpoint or self.DEFAULT_API_ENDPOINT
self.API_BASE_MTLS_URL = self.DEFAULT_API_MTLS_ENDPOINT
self.ALLOW_AUTO_SWITCH_TO_MTLS_URL = api_endpoint is None
self._client_info.gapic_version = __version__
self._client_info.client_library_version = __version__
API_VERSION = "v2" # type: ignore
"""The version of the API, used in building the API call's URL."""
API_URL_TEMPLATE = "{api_base_url}/bigquery/{api_version}{path}" # type: ignore
"""A template for the URL of a particular API call."""


@@ -0,0 +1,600 @@
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Helpers for interacting with the job REST APIs from the client.
For queries, there are three cases to consider:
1. jobs.insert: This always returns a job resource.
2. jobs.query, jobCreationMode=JOB_CREATION_REQUIRED:
This can sometimes return the results inline, but always includes a job ID.
3. jobs.query, jobCreationMode=JOB_CREATION_OPTIONAL:
This sometimes doesn't create a job at all, instead returning the results.
For better debugging, an auto-generated query ID is included in the
response.
Client.query() calls either (1) or (2), depending on what the user provides
for the api_method parameter. query() always returns a QueryJob object, which
can retry the query when the query job fails for a retriable reason.
Client.query_and_wait() calls (3). This returns a RowIterator that may wrap
local results from the response or may wrap a query job containing multiple
pages of results. Even though query_and_wait() waits for the job to complete,
we still need a separate job_retry object because there are different
predicates where it is safe to generate a new query ID.
"""
import copy
import functools
import os
import uuid
from typing import Any, Dict, Optional, TYPE_CHECKING, Union
import google.api_core.exceptions as core_exceptions
from google.api_core import retry as retries
from google.cloud.bigquery import job
import google.cloud.bigquery.query
from google.cloud.bigquery import table
import google.cloud.bigquery.retry
from google.cloud.bigquery.retry import POLLING_DEFAULT_VALUE
# Avoid circular imports
if TYPE_CHECKING: # pragma: NO COVER
from google.cloud.bigquery.client import Client
# The purpose of _TIMEOUT_BUFFER_MILLIS is to allow the server-side timeout to
# happen before the client-side timeout. This is not strictly necessary, as the
# client retries client-side timeouts, but the hope is that by making the
# server-side timeout slightly shorter, we can save the server some
# unnecessary processing time.
#
# 250 milliseconds is chosen arbitrarily, though it should be about the right
# order of magnitude for network latency and switching delays. It is about the
# amount of time for light to circumnavigate the world twice.
_TIMEOUT_BUFFER_MILLIS = 250
def make_job_id(job_id: Optional[str] = None, prefix: Optional[str] = None) -> str:
"""Construct an ID for a new job.
Args:
job_id: the user-provided job ID.
prefix: the user-provided prefix for a job ID.
Returns:
str: A job ID
"""
if job_id is not None:
return job_id
elif prefix is not None:
return str(prefix) + str(uuid.uuid4())
else:
return str(uuid.uuid4())
def job_config_with_defaults(
job_config: Optional[job.QueryJobConfig],
default_job_config: Optional[job.QueryJobConfig],
) -> Optional[job.QueryJobConfig]:
"""Create a copy of `job_config`, replacing unset values with those from
`default_job_config`.
"""
if job_config is None:
return default_job_config
if default_job_config is None:
return job_config
# Both job_config and default_job_config are not None, so make a copy of
# job_config merged with default_job_config. Anything already explicitly
# set on job_config should not be replaced.
return job_config._fill_from_default(default_job_config)
def query_jobs_insert(
client: "Client",
query: str,
job_config: Optional[job.QueryJobConfig],
job_id: Optional[str],
job_id_prefix: Optional[str],
location: Optional[str],
project: str,
retry: Optional[retries.Retry],
timeout: Optional[float],
job_retry: Optional[retries.Retry],
) -> job.QueryJob:
"""Initiate a query using jobs.insert.
See: https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/insert
"""
job_id_given = job_id is not None
job_id_save = job_id
job_config_save = job_config
def do_query():
# Make a copy now, so that the original doesn't get changed by the process
# below and to facilitate retry
job_config = copy.deepcopy(job_config_save)
job_id = make_job_id(job_id_save, job_id_prefix)
job_ref = job._JobReference(job_id, project=project, location=location)
query_job = job.QueryJob(job_ref, query, client=client, job_config=job_config)
try:
query_job._begin(retry=retry, timeout=timeout)
except core_exceptions.Conflict as create_exc:
# The thought is if someone is providing their own job IDs and they get
# their job ID generation wrong, this could end up returning results for
# the wrong query. We thus only try to recover if the job ID was not given.
if job_id_given:
raise create_exc
try:
# Sometimes we get a 404 after a Conflict. In this case, we
# have pretty high confidence that by retrying the 404, we'll
# (hopefully) eventually recover the job.
# https://github.com/googleapis/python-bigquery/issues/2134
#
# Allow users who want to completely disable retries to
# continue to do so by setting retry to None.
get_job_retry = retry
if retry is not None:
# TODO(tswast): Amend the user's retry object with allowing
# 404 to retry when there's a public way to do so.
# https://github.com/googleapis/python-api-core/issues/796
get_job_retry = (
google.cloud.bigquery.retry._DEFAULT_GET_JOB_CONFLICT_RETRY
)
query_job = client.get_job(
job_id,
project=project,
location=location,
retry=get_job_retry,
timeout=google.cloud.bigquery.retry.DEFAULT_GET_JOB_TIMEOUT,
)
except core_exceptions.GoogleAPIError: # (includes RetryError)
raise
else:
return query_job
else:
return query_job
# Allow users who want to completely disable retries to
# continue to do so by setting job_retry to None.
if job_retry is not None:
do_query = google.cloud.bigquery.retry._DEFAULT_QUERY_JOB_INSERT_RETRY(do_query)
future = do_query()
# The future might be in a failed state now, but if it's
# unrecoverable, we'll find out when we ask for its result, at which
# point, we may retry.
if not job_id_given:
future._retry_do_query = do_query # in case we have to retry later
future._job_retry = job_retry
return future
def _validate_job_config(request_body: Dict[str, Any], invalid_key: str):
"""Catch common mistakes, such as passing in a *JobConfig object of the
wrong type.
"""
if invalid_key in request_body:
raise ValueError(f"got unexpected key {repr(invalid_key)} in job_config")
def _to_query_request(
job_config: Optional[job.QueryJobConfig] = None,
*,
query: str,
location: Optional[str] = None,
timeout: Optional[float] = None,
) -> Dict[str, Any]:
"""Transform from Job resource to QueryRequest resource.
Most of the keys in job.configuration.query are in common with
QueryRequest. If any configuration property is set that is not available in
jobs.query, it will result in a server-side error.
"""
request_body = copy.copy(job_config.to_api_repr()) if job_config else {}
_validate_job_config(request_body, job.CopyJob._JOB_TYPE)
_validate_job_config(request_body, job.ExtractJob._JOB_TYPE)
_validate_job_config(request_body, job.LoadJob._JOB_TYPE)
# Move query.* properties to top-level.
query_config_resource = request_body.pop("query", {})
request_body.update(query_config_resource)
# Default to standard SQL.
request_body.setdefault("useLegacySql", False)
# Since jobs.query can return results, ensure we use the lossless timestamp
# format. See: https://github.com/googleapis/python-bigquery/issues/395
request_body.setdefault("formatOptions", {})
request_body["formatOptions"]["useInt64Timestamp"] = True # type: ignore
if timeout is not None:
# Subtract a buffer for context switching, network latency, etc.
request_body["timeoutMs"] = max(0, int(1000 * timeout) - _TIMEOUT_BUFFER_MILLIS)
if location is not None:
request_body["location"] = location
request_body["query"] = query
return request_body
def _to_query_job(
client: "Client",
query: str,
request_config: Optional[job.QueryJobConfig],
query_response: Dict[str, Any],
) -> job.QueryJob:
job_ref_resource = query_response["jobReference"]
job_ref = job._JobReference._from_api_repr(job_ref_resource)
query_job = job.QueryJob(job_ref, query, client=client)
query_job._properties.setdefault("configuration", {})
# Not all relevant properties are in the jobs.query response. Populate some
# expected properties based on the job configuration.
if request_config is not None:
query_job._properties["configuration"].update(request_config.to_api_repr())
query_job._properties["configuration"].setdefault("query", {})
query_job._properties["configuration"]["query"]["query"] = query
query_job._properties["configuration"]["query"].setdefault("useLegacySql", False)
query_job._properties.setdefault("statistics", {})
query_job._properties["statistics"].setdefault("query", {})
query_job._properties["statistics"]["query"]["cacheHit"] = query_response.get(
"cacheHit"
)
query_job._properties["statistics"]["query"]["schema"] = query_response.get(
"schema"
)
query_job._properties["statistics"]["query"][
"totalBytesProcessed"
] = query_response.get("totalBytesProcessed")
# Set errors if any were encountered.
query_job._properties.setdefault("status", {})
if "errors" in query_response:
# Set errors but not errorResult. If there was an error that failed
# the job, jobs.query behaves like jobs.getQueryResults and returns a
# non-success HTTP status code.
errors = query_response["errors"]
query_job._properties["status"]["errors"] = errors
# Avoid an extra call to `getQueryResults` if the query has finished.
job_complete = query_response.get("jobComplete")
if job_complete:
query_job._query_results = google.cloud.bigquery.query._QueryResults(
query_response
)
# We want job.result() to refresh the job state, so the conversion is
# always "PENDING", even if the job is finished.
query_job._properties["status"]["state"] = "PENDING"
return query_job
def _to_query_path(project: str) -> str:
return f"/projects/{project}/queries"
def query_jobs_query(
client: "Client",
query: str,
job_config: Optional[job.QueryJobConfig],
location: Optional[str],
project: str,
retry: retries.Retry,
timeout: Optional[float],
job_retry: retries.Retry,
) -> job.QueryJob:
"""Initiate a query using jobs.query with jobCreationMode=JOB_CREATION_REQUIRED.
See: https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/query
"""
path = _to_query_path(project)
request_body = _to_query_request(
query=query, job_config=job_config, location=location, timeout=timeout
)
def do_query():
request_body["requestId"] = make_job_id()
span_attributes = {"path": path}
api_response = client._call_api(
retry,
span_name="BigQuery.query",
span_attributes=span_attributes,
method="POST",
path=path,
data=request_body,
timeout=timeout,
)
return _to_query_job(client, query, job_config, api_response)
future = do_query()
# The future might be in a failed state now, but if it's
# unrecoverable, we'll find out when we ask for its result, at which
# point, we may retry.
future._retry_do_query = do_query # in case we have to retry later
future._job_retry = job_retry
return future
def query_and_wait(
client: "Client",
query: str,
*,
job_config: Optional[job.QueryJobConfig],
location: Optional[str],
project: str,
api_timeout: Optional[float] = None,
wait_timeout: Optional[Union[float, object]] = POLLING_DEFAULT_VALUE,
retry: Optional[retries.Retry],
job_retry: Optional[retries.Retry],
page_size: Optional[int] = None,
max_results: Optional[int] = None,
) -> table.RowIterator:
"""Run the query, wait for it to finish, and return the results.
While ``jobCreationMode=JOB_CREATION_OPTIONAL`` is in preview in the
``jobs.query`` REST API, use the default ``jobCreationMode`` unless
the environment variable ``QUERY_PREVIEW_ENABLED`` is set to ``true``. After
``jobCreationMode`` is GA, this method will always use
``jobCreationMode=JOB_CREATION_OPTIONAL``. See:
https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/query
Args:
client:
BigQuery client to make API calls.
query (str):
SQL query to be executed. Defaults to the standard SQL
dialect. Use the ``job_config`` parameter to change dialects.
job_config (Optional[google.cloud.bigquery.job.QueryJobConfig]):
Extra configuration options for the job.
To override any options that were previously set in
the ``default_query_job_config`` given to the
``Client`` constructor, manually set those options to ``None``,
or whatever value is preferred.
location (Optional[str]):
Location where to run the job. Must match the location of the
table used in the query as well as the destination table.
project (Optional[str]):
Project ID of the project where the job runs. Defaults
to the client's project.
api_timeout (Optional[float]):
The number of seconds to wait for the underlying HTTP transport
before using ``retry``.
wait_timeout (Optional[Union[float, object]]):
The number of seconds to wait for the query to finish. If the
query doesn't finish before this timeout, the client attempts
to cancel the query. If unset, the underlying Client.get_job() API
call has a timeout, but we still wait indefinitely for the job to
finish.
retry (Optional[google.api_core.retry.Retry]):
How to retry the RPC. This only applies to making RPC
calls. It isn't used to retry failed jobs. This has
a reasonable default that should only be overridden
with care.
job_retry (Optional[google.api_core.retry.Retry]):
How to retry failed jobs. The default retries
rate-limit-exceeded errors. Passing ``None`` disables
job retry. Not all jobs can be retried.
page_size (Optional[int]):
The maximum number of rows in each page of results from this
request. Non-positive values are ignored.
max_results (Optional[int]):
The maximum total number of rows from this request.
Returns:
google.cloud.bigquery.table.RowIterator:
Iterator of row data
:class:`~google.cloud.bigquery.table.Row`-s. During each
page, the iterator will have the ``total_rows`` attribute
set, which counts the total number of rows **in the result
set** (this is distinct from the total number of rows in the
current page: ``iterator.page.num_items``).
If the query is a special query that produces no results, e.g.
a DDL query, an ``_EmptyRowIterator`` instance is returned.
Raises:
TypeError:
If ``job_config`` is not an instance of
:class:`~google.cloud.bigquery.job.QueryJobConfig`
class.
"""
request_body = _to_query_request(
query=query, job_config=job_config, location=location, timeout=api_timeout
)
# Some API parameters aren't supported by the jobs.query API. In these
# cases, fall back to a jobs.insert call.
if not _supported_by_jobs_query(request_body):
return _wait_or_cancel(
query_jobs_insert(
client=client,
query=query,
job_id=None,
job_id_prefix=None,
job_config=job_config,
location=location,
project=project,
retry=retry,
timeout=api_timeout,
job_retry=job_retry,
),
api_timeout=api_timeout,
wait_timeout=wait_timeout,
retry=retry,
page_size=page_size,
max_results=max_results,
)
path = _to_query_path(project)
if page_size is not None and max_results is not None:
request_body["maxResults"] = min(page_size, max_results)
elif page_size is not None or max_results is not None:
request_body["maxResults"] = page_size or max_results
if os.getenv("QUERY_PREVIEW_ENABLED", "").casefold() == "true":
request_body["jobCreationMode"] = "JOB_CREATION_OPTIONAL"
def do_query():
request_body["requestId"] = make_job_id()
span_attributes = {"path": path}
# For easier testing, handle the retries ourselves.
if retry is not None:
response = retry(client._call_api)(
retry=None, # We're calling the retry decorator ourselves.
span_name="BigQuery.query",
span_attributes=span_attributes,
method="POST",
path=path,
data=request_body,
timeout=api_timeout,
)
else:
response = client._call_api(
retry=None,
span_name="BigQuery.query",
span_attributes=span_attributes,
method="POST",
path=path,
data=request_body,
timeout=api_timeout,
)
# Even if we run with JOB_CREATION_OPTIONAL, if there are more pages
# to fetch, there will be a job ID for jobs.getQueryResults.
query_results = google.cloud.bigquery.query._QueryResults.from_api_repr(
response
)
page_token = query_results.page_token
more_pages = page_token is not None
if more_pages or not query_results.complete:
# TODO(swast): Avoid a call to jobs.get in some cases (few
# remaining pages) by waiting for the query to finish and calling
# client._list_rows_from_query_results directly. Need to update
# RowIterator to fetch destination table via the job ID if needed.
return _wait_or_cancel(
_to_query_job(client, query, job_config, response),
api_timeout=api_timeout,
wait_timeout=wait_timeout,
retry=retry,
page_size=page_size,
max_results=max_results,
)
return table.RowIterator(
client=client,
api_request=functools.partial(client._call_api, retry, timeout=api_timeout),
path=None,
schema=query_results.schema,
max_results=max_results,
page_size=page_size,
total_rows=query_results.total_rows,
first_page_response=response,
location=query_results.location,
job_id=query_results.job_id,
query_id=query_results.query_id,
project=query_results.project,
num_dml_affected_rows=query_results.num_dml_affected_rows,
query=query,
total_bytes_processed=query_results.total_bytes_processed,
)
if job_retry is not None:
return job_retry(do_query)()
else:
return do_query()
def _supported_by_jobs_query(request_body: Dict[str, Any]) -> bool:
"""True if jobs.query can be used. False if jobs.insert is needed."""
request_keys = frozenset(request_body.keys())
# Per issue: https://github.com/googleapis/python-bigquery/issues/1867
# use an allowlist here instead of a denylist because the backend API allows
# unsupported parameters without any warning or failure. Instead, keep this
# set in sync with those in QueryRequest:
# https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/query#QueryRequest
keys_allowlist = {
"kind",
"query",
"maxResults",
"defaultDataset",
"timeoutMs",
"dryRun",
"preserveNulls",
"useQueryCache",
"useLegacySql",
"parameterMode",
"queryParameters",
"location",
"formatOptions",
"connectionProperties",
"labels",
"maximumBytesBilled",
"requestId",
"createSession",
}
unsupported_keys = request_keys - keys_allowlist
return len(unsupported_keys) == 0
def _wait_or_cancel(
job: job.QueryJob,
api_timeout: Optional[float],
wait_timeout: Optional[Union[object, float]],
retry: Optional[retries.Retry],
page_size: Optional[int],
max_results: Optional[int],
) -> table.RowIterator:
"""Wait for a job to complete and return the results.
If we can't return the results within the ``wait_timeout``, try to cancel
the job.
"""
try:
return job.result(
page_size=page_size,
max_results=max_results,
retry=retry,
timeout=wait_timeout,
)
except Exception:
# Attempt to cancel the job since we can't return the results.
try:
job.cancel(retry=retry, timeout=api_timeout)
except Exception:
# Don't eat the original exception if cancel fails.
pass
raise
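
A sketch of the small helpers defined above, which back Client.query() and Client.query_and_wait() as described in the module docstring. The prefix and job configurations are illustrative, and the module path google.cloud.bigquery._job_helpers is assumed:

from google.cloud.bigquery import QueryJobConfig
from google.cloud.bigquery import _job_helpers  # assumed module path

# make_job_id: an explicit ID wins, otherwise prefix + uuid4, otherwise a bare uuid4.
print(_job_helpers.make_job_id("my-job-id"))    # -> "my-job-id"
print(_job_helpers.make_job_id(prefix="etl_"))  # -> "etl_<random uuid4>"

# job_config_with_defaults: values unset on job_config are filled from the default.
default_config = QueryJobConfig(use_query_cache=False)
job_config = QueryJobConfig(maximum_bytes_billed=10**9)
merged = _job_helpers.job_config_with_defaults(job_config, default_config)
print(merged.use_query_cache, merged.maximum_bytes_billed)  # False 1000000000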

File diff suppressed because it is too large.


@@ -0,0 +1,147 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Shared helper functions for connecting BigQuery and pyarrow.
NOTE: This module is DEPRECATED. Please make updates in the pandas-gbq package,
instead. See: go/pandas-gbq-and-bigframes-redundancy,
https://github.com/googleapis/python-bigquery-pandas/blob/main/pandas_gbq/schema/bigquery_to_pyarrow.py
and
https://github.com/googleapis/python-bigquery-pandas/blob/main/pandas_gbq/schema/pyarrow_to_bigquery.py
"""
from typing import Any
try:
import pyarrow # type: ignore
except ImportError:
pyarrow = None
try:
import db_dtypes # type: ignore
db_dtypes_import_exception = None
except ImportError as exc:
db_dtypes = None
db_dtypes_import_exception = exc
def pyarrow_datetime():
return pyarrow.timestamp("us", tz=None)
def pyarrow_numeric():
return pyarrow.decimal128(38, 9)
def pyarrow_bignumeric():
# 77th digit is partial.
# https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#decimal_types
return pyarrow.decimal256(76, 38)
def pyarrow_time():
return pyarrow.time64("us")
def pyarrow_timestamp():
return pyarrow.timestamp("us", tz="UTC")
_BQ_TO_ARROW_SCALARS = {}
_ARROW_SCALAR_IDS_TO_BQ = {}
if pyarrow:
# This dictionary is duplicated in bigquery_storage/test/unit/test_reader.py
# When modifying it be sure to update it there as well.
# Note(todo!!): type "BIGNUMERIC"'s matching pyarrow type is added in _pandas_helpers.py
_BQ_TO_ARROW_SCALARS = {
"BOOL": pyarrow.bool_,
"BOOLEAN": pyarrow.bool_,
"BYTES": pyarrow.binary,
"DATE": pyarrow.date32,
"DATETIME": pyarrow_datetime,
"FLOAT": pyarrow.float64,
"FLOAT64": pyarrow.float64,
"GEOGRAPHY": pyarrow.string,
"INT64": pyarrow.int64,
"INTEGER": pyarrow.int64,
# Normally, we'd prefer JSON type built-in to pyarrow (added in 19.0.0),
# but we'd like this to map as closely to the BQ Storage API as
# possible, which uses the string() dtype, as JSON support in Arrow
# predates JSON support in BigQuery by several years.
"JSON": pyarrow.string,
"NUMERIC": pyarrow_numeric,
"STRING": pyarrow.string,
"TIME": pyarrow_time,
"TIMESTAMP": pyarrow_timestamp,
}
# DEPRECATED: update pandas_gbq.schema.pyarrow_to_bigquery, instead.
_ARROW_SCALAR_IDS_TO_BQ = {
# https://arrow.apache.org/docs/python/api/datatypes.html#type-classes
pyarrow.bool_().id: "BOOL",
pyarrow.int8().id: "INT64",
pyarrow.int16().id: "INT64",
pyarrow.int32().id: "INT64",
pyarrow.int64().id: "INT64",
pyarrow.uint8().id: "INT64",
pyarrow.uint16().id: "INT64",
pyarrow.uint32().id: "INT64",
pyarrow.uint64().id: "INT64",
pyarrow.float16().id: "FLOAT64",
pyarrow.float32().id: "FLOAT64",
pyarrow.float64().id: "FLOAT64",
pyarrow.time32("ms").id: "TIME",
pyarrow.time64("ns").id: "TIME",
pyarrow.timestamp("ns").id: "TIMESTAMP",
pyarrow.date32().id: "DATE",
pyarrow.date64().id: "DATETIME", # because millisecond resolution
pyarrow.binary().id: "BYTES",
pyarrow.string().id: "STRING", # also alias for pyarrow.utf8()
pyarrow.large_string().id: "STRING",
# The exact scale and precision don't matter, see below.
pyarrow.decimal128(38, scale=9).id: "NUMERIC",
# NOTE: all extension types (e.g. json_, uuid, db_dtypes.JSONArrowType)
# have the same id (31 as of version 19.0.1), so these should not be
# matched by id.
}
_BQ_TO_ARROW_SCALARS["BIGNUMERIC"] = pyarrow_bignumeric
# The exact decimal's scale and precision are not important, as only
# the type ID matters, and it's the same for all decimal256 instances.
_ARROW_SCALAR_IDS_TO_BQ[pyarrow.decimal256(76, scale=38).id] = "BIGNUMERIC"
def bq_to_arrow_scalars(bq_scalar: str):
"""
DEPRECATED: update pandas_gbq.schema.bigquery_to_pyarrow, instead, which is
to be added in https://github.com/googleapis/python-bigquery-pandas/pull/893.
Returns:
The Arrow scalar type that the input BigQuery scalar type maps to.
If it cannot find the BigQuery scalar, return None.
"""
return _BQ_TO_ARROW_SCALARS.get(bq_scalar)
def arrow_scalar_ids_to_bq(arrow_scalar: Any):
"""
DEPRECATED: update pandas_gbq.schema.pyarrow_to_bigquery, instead.
Returns:
The BigQuery scalar type that the input arrow scalar type maps to.
If it cannot find the arrow scalar, return None.
"""
return _ARROW_SCALAR_IDS_TO_BQ.get(arrow_scalar)
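
A sketch of the two lookup helpers above; it requires pyarrow to be installed, and the module path google.cloud.bigquery._pyarrow_helpers is assumed:

import pyarrow
from google.cloud.bigquery import _pyarrow_helpers  # assumed module path

# BigQuery type name -> factory for the matching Arrow type.
factory = _pyarrow_helpers.bq_to_arrow_scalars("NUMERIC")
print(factory())  # decimal128(38, 9)

# Arrow type id -> BigQuery type name (None for unknown ids).
print(_pyarrow_helpers.arrow_scalar_ids_to_bq(pyarrow.int64().id))   # INT64
print(_pyarrow_helpers.arrow_scalar_ids_to_bq(pyarrow.string().id))  # STRING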


@@ -0,0 +1,137 @@
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Shared helper functions for tqdm progress bar."""
import concurrent.futures
import sys
import time
import typing
from typing import Optional
import warnings
try:
import tqdm # type: ignore
except ImportError:
tqdm = None
try:
import tqdm.notebook as tqdm_notebook # type: ignore
except ImportError:
tqdm_notebook = None
if typing.TYPE_CHECKING: # pragma: NO COVER
from google.cloud.bigquery import QueryJob
from google.cloud.bigquery.table import RowIterator
_NO_TQDM_ERROR = (
"A progress bar was requested, but there was an error loading the tqdm "
"library. Please install tqdm to use the progress bar functionality."
)
_PROGRESS_BAR_UPDATE_INTERVAL = 0.5
def get_progress_bar(progress_bar_type, description, total, unit):
"""Construct a tqdm progress bar object, if tqdm is installed."""
if tqdm is None or tqdm_notebook is None and progress_bar_type == "tqdm_notebook":
if progress_bar_type is not None:
warnings.warn(_NO_TQDM_ERROR, UserWarning, stacklevel=3)
return None
try:
if progress_bar_type == "tqdm":
return tqdm.tqdm(
bar_format="{l_bar}{bar}|",
colour="green",
desc=description,
file=sys.stdout,
total=total,
unit=unit,
)
elif progress_bar_type == "tqdm_notebook":
return tqdm_notebook.tqdm(
bar_format="{l_bar}{bar}|",
desc=description,
file=sys.stdout,
total=total,
unit=unit,
)
elif progress_bar_type == "tqdm_gui":
return tqdm.tqdm_gui(desc=description, total=total, unit=unit)
except (KeyError, TypeError): # pragma: NO COVER
# Protect ourselves from any tqdm errors. In case of
# unexpected tqdm behavior, just fall back to showing
# no progress bar.
warnings.warn(_NO_TQDM_ERROR, UserWarning, stacklevel=3)
return None
def wait_for_query(
query_job: "QueryJob",
progress_bar_type: Optional[str] = None,
max_results: Optional[int] = None,
) -> "RowIterator":
"""Return query result and display a progress bar while the query running, if tqdm is installed.
Args:
query_job:
The job representing the execution of the query on the server.
progress_bar_type:
The type of progress bar to use to show query progress.
max_results:
The maximum number of rows the row iterator should return.
Returns:
A row iterator over the query results.
"""
default_total = 1
current_stage = None
start_time = time.perf_counter()
progress_bar = get_progress_bar(
progress_bar_type, "Query is running", default_total, "query"
)
if progress_bar is None:
return query_job.result(max_results=max_results)
i = 0
while True:
if query_job.query_plan:
default_total = len(query_job.query_plan)
current_stage = query_job.query_plan[i]
progress_bar.total = len(query_job.query_plan)
progress_bar.set_description(
f"Query executing stage {current_stage.name} and status {current_stage.status} : {time.perf_counter() - start_time:.2f}s"
)
try:
query_result = query_job.result(
timeout=_PROGRESS_BAR_UPDATE_INTERVAL, max_results=max_results
)
progress_bar.update(default_total)
progress_bar.set_description(
f"Job ID {query_job.job_id} successfully executed",
)
break
except concurrent.futures.TimeoutError:
query_job.reload() # Refreshes the state via a GET request.
if current_stage:
if current_stage.status == "COMPLETE":
if i < default_total - 1:
progress_bar.update(i + 1)
i += 1
continue
progress_bar.close()
return query_result
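
A sketch of the progress-bar helper above. The description, total, and unit are arbitrary; when tqdm is not installed, get_progress_bar warns and returns None. The module path google.cloud.bigquery._tqdm_helpers is assumed:

from google.cloud.bigquery import _tqdm_helpers  # assumed module path

bar = _tqdm_helpers.get_progress_bar("tqdm", "Downloading rows", 100, "rows")
if bar is not None:
    for _ in range(100):
        bar.update(1)
    bar.close()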


@@ -0,0 +1,264 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Shared helper functions for verifying versions of installed modules."""
import sys
from typing import Any
import packaging.version
from google.cloud.bigquery import exceptions
_MIN_PYARROW_VERSION = packaging.version.Version("3.0.0")
_MIN_BQ_STORAGE_VERSION = packaging.version.Version("2.0.0")
_BQ_STORAGE_OPTIONAL_READ_SESSION_VERSION = packaging.version.Version("2.6.0")
_MIN_PANDAS_VERSION = packaging.version.Version("1.1.0")
_MIN_PANDAS_VERSION_RANGE = packaging.version.Version("1.5.0")
_MIN_PYARROW_VERSION_RANGE = packaging.version.Version("10.0.1")
class PyarrowVersions:
"""Version comparisons for pyarrow package."""
def __init__(self):
self._installed_version = None
@property
def installed_version(self) -> packaging.version.Version:
"""Return the parsed version of pyarrow."""
if self._installed_version is None:
import pyarrow # type: ignore
self._installed_version = packaging.version.parse(
# Use 0.0.0, since it is earlier than any released version.
# Legacy versions also have the same property, but
# creating a LegacyVersion has been deprecated.
# https://github.com/pypa/packaging/issues/321
getattr(pyarrow, "__version__", "0.0.0")
)
return self._installed_version
@property
def use_compliant_nested_type(self) -> bool:
return self.installed_version.major >= 4
def try_import(self, raise_if_error: bool = False) -> Any:
"""Verifies that a recent enough version of pyarrow extra is installed.
The function assumes that pyarrow extra is installed, and should thus
be used in places where this assumption holds.
Because `pip` can install an outdated version of this extra despite
the constraints in `setup.py`, the calling code can use this helper
to verify the version compatibility at runtime.
Returns:
The ``pyarrow`` module or ``None``.
Raises:
exceptions.LegacyPyarrowError:
If the pyarrow package is outdated and ``raise_if_error`` is
``True``.
"""
try:
import pyarrow
except ImportError as exc:
if raise_if_error:
raise exceptions.LegacyPyarrowError(
"pyarrow package not found. Install pyarrow version >="
f" {_MIN_PYARROW_VERSION}."
) from exc
return None
if self.installed_version < _MIN_PYARROW_VERSION:
if raise_if_error:
msg = (
"Dependency pyarrow is outdated, please upgrade"
f" it to version >= {_MIN_PYARROW_VERSION}"
f" (version found: {self.installed_version})."
)
raise exceptions.LegacyPyarrowError(msg)
return None
return pyarrow
PYARROW_VERSIONS = PyarrowVersions()
class BQStorageVersions:
"""Version comparisons for google-cloud-bigqueyr-storage package."""
def __init__(self):
self._installed_version = None
@property
def installed_version(self) -> packaging.version.Version:
"""Return the parsed version of google-cloud-bigquery-storage."""
if self._installed_version is None:
from google.cloud import bigquery_storage
self._installed_version = packaging.version.parse(
# Use 0.0.0, since it is earlier than any released version.
# Legacy versions also have the same property, but
# creating a LegacyVersion has been deprecated.
# https://github.com/pypa/packaging/issues/321
getattr(bigquery_storage, "__version__", "0.0.0")
)
return self._installed_version # type: ignore
@property
def is_read_session_optional(self) -> bool:
"""True if read_session is optional to rows().
See: https://github.com/googleapis/python-bigquery-storage/pull/228
"""
return self.installed_version >= _BQ_STORAGE_OPTIONAL_READ_SESSION_VERSION
def try_import(self, raise_if_error: bool = False) -> Any:
"""Tries to import the bigquery_storage module, and returns results
accordingly. It also verifies the module version is recent enough.
If the import succeeds, returns the ``bigquery_storage`` module.
If the import fails, returns ``None`` when ``raise_if_error == False``
and raises an error when ``raise_if_error == True``.
Returns:
The ``bigquery_storage`` module or ``None``.
Raises:
exceptions.BigQueryStorageNotFoundError:
If google-cloud-bigquery-storage is not installed
exceptions.LegacyBigQueryStorageError:
If google-cloud-bigquery-storage package is outdated
"""
try:
from google.cloud import bigquery_storage # type: ignore
except ImportError:
if raise_if_error:
msg = (
"Package google-cloud-bigquery-storage not found. "
"Install google-cloud-bigquery-storage version >= "
f"{_MIN_BQ_STORAGE_VERSION}."
)
raise exceptions.BigQueryStorageNotFoundError(msg)
return None
if self.installed_version < _MIN_BQ_STORAGE_VERSION:
if raise_if_error:
msg = (
"Dependency google-cloud-bigquery-storage is outdated, "
f"please upgrade it to version >= {_MIN_BQ_STORAGE_VERSION} "
f"(version found: {self.installed_version})."
)
raise exceptions.LegacyBigQueryStorageError(msg)
return None
return bigquery_storage
BQ_STORAGE_VERSIONS = BQStorageVersions()
class PandasVersions:
"""Version comparisons for pandas package."""
def __init__(self):
self._installed_version = None
@property
def installed_version(self) -> packaging.version.Version:
"""Return the parsed version of pandas"""
if self._installed_version is None:
import pandas # type: ignore
self._installed_version = packaging.version.parse(
# Use 0.0.0, since it is earlier than any released version.
# Legacy versions also have the same property, but
# creating a LegacyVersion has been deprecated.
# https://github.com/pypa/packaging/issues/321
getattr(pandas, "__version__", "0.0.0")
)
return self._installed_version
def try_import(self, raise_if_error: bool = False) -> Any:
"""Verify that a recent enough version of pandas extra is installed.
The function assumes that pandas extra is installed, and should thus
be used in places where this assumption holds.
Because `pip` can install an outdated version of this extra despite
the constraints in `setup.py`, the calling code can use this helper
to verify the version compatibility at runtime.
Returns:
The ``pandas`` module or ``None``.
Raises:
exceptions.LegacyPandasError:
If the pandas package is outdated and ``raise_if_error`` is
``True``.
"""
try:
import pandas
except ImportError as exc:
if raise_if_error:
raise exceptions.LegacyPandasError(
"pandas package not found. Install pandas version >="
f" {_MIN_PANDAS_VERSION}"
) from exc
return None
if self.installed_version < _MIN_PANDAS_VERSION:
if raise_if_error:
msg = (
"Dependency pandas is outdated, please upgrade"
f" it to version >= {_MIN_PANDAS_VERSION}"
f" (version found: {self.installed_version})."
)
raise exceptions.LegacyPandasError(msg)
return None
return pandas
PANDAS_VERSIONS = PandasVersions()
# Since RANGE support in pandas requires specific versions
# of both pyarrow and pandas, we make this a separate
# constant instead of as a property of PANDAS_VERSIONS
# or PYARROW_VERSIONS.
SUPPORTS_RANGE_PYARROW = (
PANDAS_VERSIONS.try_import() is not None
and PANDAS_VERSIONS.installed_version >= _MIN_PANDAS_VERSION_RANGE
and PYARROW_VERSIONS.try_import() is not None
and PYARROW_VERSIONS.installed_version >= _MIN_PYARROW_VERSION_RANGE
)
def extract_runtime_version():
# Retrieve the version information
version_info = sys.version_info
# Extract the major, minor, and micro components
major = version_info.major
minor = version_info.minor
micro = version_info.micro
# Return the major, minor, and micro components as a tuple
return major, minor, micro
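
A sketch of how the version helpers above are typically consulted; the module path google.cloud.bigquery._versions_helpers is assumed:

from google.cloud.bigquery import _versions_helpers  # assumed module path

# Returns the pyarrow module, or None if it is missing or too old.
pyarrow = _versions_helpers.PYARROW_VERSIONS.try_import()

# With raise_if_error=True, raises LegacyPandasError instead of returning None.
pandas = _versions_helpers.PANDAS_VERSIONS.try_import(raise_if_error=True)

major, minor, micro = _versions_helpers.extract_runtime_version()
print(f"Running on Python {major}.{minor}.{micro}")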

File diff suppressed because it is too large.

File diff suppressed because it is too large.


@@ -0,0 +1,87 @@
# Copyright 2017 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Google BigQuery implementation of the Database API Specification v2.0.
This module implements the `Python Database API Specification v2.0 (DB-API)`_
for Google BigQuery.
.. _Python Database API Specification v2.0 (DB-API):
https://www.python.org/dev/peps/pep-0249/
"""
from google.cloud.bigquery.dbapi.connection import connect
from google.cloud.bigquery.dbapi.connection import Connection
from google.cloud.bigquery.dbapi.cursor import Cursor
from google.cloud.bigquery.dbapi.exceptions import Warning
from google.cloud.bigquery.dbapi.exceptions import Error
from google.cloud.bigquery.dbapi.exceptions import InterfaceError
from google.cloud.bigquery.dbapi.exceptions import DatabaseError
from google.cloud.bigquery.dbapi.exceptions import DataError
from google.cloud.bigquery.dbapi.exceptions import OperationalError
from google.cloud.bigquery.dbapi.exceptions import IntegrityError
from google.cloud.bigquery.dbapi.exceptions import InternalError
from google.cloud.bigquery.dbapi.exceptions import ProgrammingError
from google.cloud.bigquery.dbapi.exceptions import NotSupportedError
from google.cloud.bigquery.dbapi.types import Binary
from google.cloud.bigquery.dbapi.types import Date
from google.cloud.bigquery.dbapi.types import DateFromTicks
from google.cloud.bigquery.dbapi.types import Time
from google.cloud.bigquery.dbapi.types import TimeFromTicks
from google.cloud.bigquery.dbapi.types import Timestamp
from google.cloud.bigquery.dbapi.types import TimestampFromTicks
from google.cloud.bigquery.dbapi.types import BINARY
from google.cloud.bigquery.dbapi.types import DATETIME
from google.cloud.bigquery.dbapi.types import NUMBER
from google.cloud.bigquery.dbapi.types import ROWID
from google.cloud.bigquery.dbapi.types import STRING
apilevel = "2.0"
# Threads may share the module and connections, but not cursors.
threadsafety = 2
paramstyle = "pyformat"
__all__ = [
"apilevel",
"threadsafety",
"paramstyle",
"connect",
"Connection",
"Cursor",
"Warning",
"Error",
"InterfaceError",
"DatabaseError",
"DataError",
"OperationalError",
"IntegrityError",
"InternalError",
"ProgrammingError",
"NotSupportedError",
"Binary",
"Date",
"DateFromTicks",
"Time",
"TimeFromTicks",
"Timestamp",
"TimestampFromTicks",
"BINARY",
"DATETIME",
"NUMBER",
"ROWID",
"STRING",
]
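
A minimal DB-API usage sketch based on the interface exported above, using the declared ``pyformat`` paramstyle. It assumes default credentials; the query is a placeholder:

from google.cloud.bigquery import dbapi

conn = dbapi.connect()  # an existing bigquery.Client may also be passed in
cur = conn.cursor()
cur.execute("SELECT %(x)s + 1 AS y", {"x": 41})
print(cur.fetchall())  # one row containing 42
conn.close()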


@@ -0,0 +1,522 @@
# Copyright 2017 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import abc as collections_abc
import datetime
import decimal
import functools
import numbers
import re
import typing
from google.cloud import bigquery
from google.cloud.bigquery import table, query
from google.cloud.bigquery.dbapi import exceptions
_NUMERIC_SERVER_MIN = decimal.Decimal("-9.9999999999999999999999999999999999999E+28")
_NUMERIC_SERVER_MAX = decimal.Decimal("9.9999999999999999999999999999999999999E+28")
type_parameters_re = re.compile(
r"""
\(
\s*[0-9]+\s*
(,
\s*[0-9]+\s*
)*
\)
""",
re.VERBOSE,
)
def _parameter_type(name, value, query_parameter_type=None, value_doc=""):
if query_parameter_type:
# Strip type parameters
query_parameter_type = type_parameters_re.sub("", query_parameter_type)
try:
parameter_type = getattr(
query.SqlParameterScalarTypes, query_parameter_type.upper()
)._type
except AttributeError:
raise exceptions.ProgrammingError(
f"The given parameter type, {query_parameter_type},"
f" for {name} is not a valid BigQuery scalar type."
)
else:
parameter_type = bigquery_scalar_type(value)
if parameter_type is None:
raise exceptions.ProgrammingError(
f"Encountered parameter {name} with "
f"{value_doc} value {value} of unexpected type."
)
return parameter_type
def scalar_to_query_parameter(value, name=None, query_parameter_type=None):
"""Convert a scalar value into a query parameter.
Args:
value (Any):
A scalar value to convert into a query parameter.
name (str):
(Optional) Name of the query parameter.
query_parameter_type (Optional[str]): Given type for the parameter.
Returns:
google.cloud.bigquery.ScalarQueryParameter:
A query parameter corresponding with the type and value of the plain
Python object.
Raises:
google.cloud.bigquery.dbapi.exceptions.ProgrammingError:
if the type cannot be determined.
"""
return bigquery.ScalarQueryParameter(
name, _parameter_type(name, value, query_parameter_type), value
)
def array_to_query_parameter(value, name=None, query_parameter_type=None):
"""Convert an array-like value into a query parameter.
Args:
value (Sequence[Any]): The elements of the array (should not be a
string-like Sequence).
name (Optional[str]): Name of the query parameter.
query_parameter_type (Optional[str]): Given type for the parameter.
Returns:
A query parameter corresponding with the type and value of the plain
Python object.
Raises:
google.cloud.bigquery.dbapi.exceptions.ProgrammingError:
if the type of array elements cannot be determined.
"""
if not array_like(value):
raise exceptions.ProgrammingError(
"The value of parameter {} must be a sequence that is "
"not string-like.".format(name)
)
if query_parameter_type or value:
array_type = _parameter_type(
name,
value[0] if value else None,
query_parameter_type,
value_doc="array element ",
)
else:
raise exceptions.ProgrammingError(
"Encountered an empty array-like value of parameter {}, cannot "
"determine array elements type.".format(name)
)
return bigquery.ArrayQueryParameter(name, array_type, value)
def _parse_struct_fields(
fields,
base,
parse_struct_field=re.compile(
r"""
(?:(\w+)\s+) # field name
([A-Z0-9<> ,()]+) # Field type
$""",
re.VERBOSE | re.IGNORECASE,
).match,
):
# Split a string of struct fields. They're separated by commas, but
# we have to avoid splitting on commas internal to fields. For
# example:
# name string, children array<struct<name string, bdate date>>
#
# only has 2 top-level fields.
fields = fields.split(",")
fields = list(reversed(fields)) # on the off chance that there are very many
while fields:
field = fields.pop()
while fields and field.count("<") != field.count(">"):
field += "," + fields.pop()
m = parse_struct_field(field.strip())
if not m:
raise exceptions.ProgrammingError(
f"Invalid struct field, {field}, in {base}"
)
yield m.group(1, 2)
SCALAR, ARRAY, STRUCT = ("s", "a", "r")
def _parse_type(
type_,
name,
base,
complex_query_parameter_parse=re.compile(
r"""
\s*
(ARRAY|STRUCT|RECORD) # Type
\s*
<([A-Z0-9_<> ,()]+)> # Subtype(s)
\s*$
""",
re.IGNORECASE | re.VERBOSE,
).match,
):
if "<" not in type_:
# Scalar
# Strip type parameters
type_ = type_parameters_re.sub("", type_).strip()
try:
type_ = getattr(query.SqlParameterScalarTypes, type_.upper())
except AttributeError:
raise exceptions.ProgrammingError(
f"The given parameter type, {type_},"
f"{' for ' + name if name else ''}"
f" is not a valid BigQuery scalar type, in {base}."
)
if name:
type_ = type_.with_name(name)
return SCALAR, type_
m = complex_query_parameter_parse(type_)
if not m:
raise exceptions.ProgrammingError(f"Invalid parameter type, {type_}")
tname, sub = m.group(1, 2)
if tname.upper() == "ARRAY":
sub_type = complex_query_parameter_type(None, sub, base)
if isinstance(sub_type, query.ArrayQueryParameterType):
raise exceptions.ProgrammingError(f"Array can't contain an array in {base}")
sub_type._complex__src = sub
return ARRAY, sub_type
else:
return STRUCT, _parse_struct_fields(sub, base)
def complex_query_parameter_type(name: typing.Optional[str], type_: str, base: str):
"""Construct a parameter type (`StructQueryParameterType`) for a complex type
or a non-complex type that's part of a complex type.
Examples:
array<struct<x float64, y float64>>
struct<name string, children array<struct<name string, bdate date>>>
This is used for computing array types.
"""
type_type, sub_type = _parse_type(type_, name, base)
if type_type == SCALAR:
result_type = sub_type
elif type_type == ARRAY:
result_type = query.ArrayQueryParameterType(sub_type, name=name)
elif type_type == STRUCT:
fields = [
complex_query_parameter_type(field_name, field_type, base)
for field_name, field_type in sub_type
]
result_type = query.StructQueryParameterType(*fields, name=name)
else: # pragma: NO COVER
raise AssertionError("Bad type_type", type_type) # Can't happen :)
return result_type
def complex_query_parameter(
name: typing.Optional[str], value, type_: str, base: typing.Optional[str] = None
):
"""
Construct a query parameter for a complex type (array or struct record)
or for a subtype, which may not be complex
Examples:
array<struct<x float64, y float64>>
struct<name string, children array<struct<name string, bdate date>>>
"""
param: typing.Union[
query.ScalarQueryParameter,
query.ArrayQueryParameter,
query.StructQueryParameter,
]
base = base or type_
type_type, sub_type = _parse_type(type_, name, base)
if type_type == SCALAR:
param = query.ScalarQueryParameter(name, sub_type._type, value)
elif type_type == ARRAY:
if not array_like(value):
raise exceptions.ProgrammingError(
f"Array type with non-array-like value"
f" with type {type(value).__name__}"
)
param = query.ArrayQueryParameter(
name,
sub_type,
(
value
if isinstance(sub_type, query.ScalarQueryParameterType)
else [
complex_query_parameter(None, v, sub_type._complex__src, base)
for v in value
]
),
)
elif type_type == STRUCT:
if not isinstance(value, collections_abc.Mapping):
raise exceptions.ProgrammingError(f"Non-mapping value for type {type_}")
value_keys = set(value)
fields = []
for field_name, field_type in sub_type:
if field_name not in value:
raise exceptions.ProgrammingError(
f"No field value for {field_name} in {type_}"
)
value_keys.remove(field_name)
fields.append(
complex_query_parameter(field_name, value[field_name], field_type, base)
)
if value_keys:
raise exceptions.ProgrammingError(f"Extra data keys for {type_}")
param = query.StructQueryParameter(name, *fields)
else: # pragma: NO COVER
raise AssertionError("Bad type_type", type_type) # Can't happen :)
return param
def _dispatch_parameter(type_, value, name=None):
if type_ is not None and "<" in type_:
param = complex_query_parameter(name, value, type_)
elif isinstance(value, collections_abc.Mapping):
raise NotImplementedError(
f"STRUCT-like parameter values are not supported"
f"{' (parameter ' + name + ')' if name else ''},"
f" unless an explicit type is give in the parameter placeholder"
f" (e.g. '%({name if name else ''}:struct<...>)s')."
)
elif array_like(value):
param = array_to_query_parameter(value, name, type_)
else:
param = scalar_to_query_parameter(value, name, type_)
return param
def to_query_parameters_list(parameters, parameter_types):
"""Converts a sequence of parameter values into query parameters.
Args:
parameters (Sequence[Any]): Sequence of query parameter values.
parameter_types:
A list of parameter types, one for each parameter.
Unknown types are provided as None.
Returns:
List[google.cloud.bigquery.query._AbstractQueryParameter]:
A list of query parameters.
"""
return [
_dispatch_parameter(type_, value)
for value, type_ in zip(parameters, parameter_types)
]
def to_query_parameters_dict(parameters, query_parameter_types):
"""Converts a dictionary of parameter values into query parameters.
Args:
parameters (Mapping[str, Any]): Dictionary of query parameter values.
parameter_types:
A dictionary of parameter types. It needn't have a key for each
parameter.
Returns:
List[google.cloud.bigquery.query._AbstractQueryParameter]:
A list of named query parameters.
"""
return [
_dispatch_parameter(query_parameter_types.get(name), value, name)
for name, value in parameters.items()
]
def to_query_parameters(parameters, parameter_types):
"""Converts DB-API parameter values into query parameters.
Args:
parameters (Union[Mapping[str, Any], Sequence[Any]]):
A dictionary or sequence of query parameter values.
parameter_types (Union[Mapping[str, str], Sequence[str]]):
A dictionary or list of parameter types.
If parameters is a mapping, then this must be a dictionary
of parameter types. It needn't have a key for each
parameter.
If parameters is a sequence, then this must be a list of
parameter types, one for each parameter. Unknown types
are provided as None.
Returns:
List[google.cloud.bigquery.query._AbstractQueryParameter]:
A list of query parameters.
"""
if parameters is None:
return []
if isinstance(parameters, collections_abc.Mapping):
return to_query_parameters_dict(parameters, parameter_types)
else:
return to_query_parameters_list(parameters, parameter_types)
def bigquery_scalar_type(value):
"""Return a BigQuery name of the scalar type that matches the given value.
If the scalar type name could not be determined (e.g. for non-scalar
values), ``None`` is returned.
Args:
value (Any)
Returns:
Optional[str]: The BigQuery scalar type name.
"""
if isinstance(value, bool):
return "BOOL"
elif isinstance(value, numbers.Integral):
return "INT64"
elif isinstance(value, numbers.Real):
return "FLOAT64"
elif isinstance(value, decimal.Decimal):
vtuple = value.as_tuple()
# NUMERIC values have precision of 38 (number of digits) and scale of 9 (number
# of fractional digits), and their max absolute value must be strictly smaller
# than 1.0E+29.
# https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#decimal_types
if (
len(vtuple.digits) <= 38 # max precision: 38
and vtuple.exponent >= -9 # max scale: 9
and _NUMERIC_SERVER_MIN <= value <= _NUMERIC_SERVER_MAX
):
return "NUMERIC"
else:
return "BIGNUMERIC"
elif isinstance(value, str):
return "STRING"
elif isinstance(value, bytes):
return "BYTES"
elif isinstance(value, datetime.datetime):
return "DATETIME" if value.tzinfo is None else "TIMESTAMP"
elif isinstance(value, datetime.date):
return "DATE"
elif isinstance(value, datetime.time):
return "TIME"
return None
def array_like(value):
"""Determine if the given value is array-like.
Examples of array-like values (as interpreted by this function) are
sequences such as ``list`` and ``tuple``, but not strings and other
iterables such as sets.
Args:
value (Any)
Returns:
bool: ``True`` if the value is considered array-like, ``False`` otherwise.
"""
return isinstance(value, collections_abc.Sequence) and not isinstance(
value, (str, bytes, bytearray)
)
def to_bq_table_rows(rows_iterable):
"""Convert table rows to BigQuery table Row instances.
Args:
rows_iterable (Iterable[Mapping]):
An iterable of row data items to convert to ``Row`` instances.
Returns:
Iterable[google.cloud.bigquery.table.Row]
"""
def to_table_row(row):
# NOTE: We fetch ARROW values, thus we need to convert them to Python
# objects with as_py().
values = tuple(value.as_py() for value in row.values())
keys_to_index = {key: i for i, key in enumerate(row.keys())}
return table.Row(values, keys_to_index)
return (to_table_row(row_data) for row_data in rows_iterable)
def raise_on_closed(
exc_msg, exc_class=exceptions.ProgrammingError, closed_attr_name="_closed"
):
"""Make public instance methods raise an error if the instance is closed."""
def _raise_on_closed(method):
"""Make a non-static method raise an error if its containing instance is closed."""
def with_closed_check(self, *args, **kwargs):
if getattr(self, closed_attr_name):
raise exc_class(exc_msg)
return method(self, *args, **kwargs)
functools.update_wrapper(with_closed_check, method)
return with_closed_check
def decorate_public_methods(klass):
"""Apply ``_raise_on_closed()`` decorator to public instance methods."""
for name in dir(klass):
if name.startswith("_") and name != "__iter__":
continue
member = getattr(klass, name)
if not callable(member):
continue
# We need to check for class/static methods directly in the instance
# __dict__, not via the retrieved attribute (`member`), as the
# latter is already a callable *produced* by one of these descriptors.
if isinstance(klass.__dict__[name], (staticmethod, classmethod)):
continue
member = _raise_on_closed(member)
setattr(klass, name, member)
return klass
return decorate_public_methods
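# --- Illustrative usage sketch (not part of the module above) ---
# A minimal example, assuming an installed google-cloud-bigquery distribution,
# of how these helpers turn DB-API parameter values into BigQuery query
# parameter objects. The struct value needs an explicit type string; the plain
# string value has its scalar type inferred.
from google.cloud.bigquery.dbapi import _helpers

params = _helpers.to_query_parameters(
    {"name": "Alice", "point": {"x": 1.0, "y": 2.0}},
    {"point": "struct<x float64, y float64>"},
)
for param in params:
    # Expect a ScalarQueryParameter for "name" and a StructQueryParameter for "point".
    print(type(param).__name__, param.name)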

View File

@@ -0,0 +1,128 @@
# Copyright 2017 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Connection for the Google BigQuery DB-API."""
import weakref
from google.cloud import bigquery
from google.cloud.bigquery.dbapi import cursor
from google.cloud.bigquery.dbapi import _helpers
@_helpers.raise_on_closed("Operating on a closed connection.")
class Connection(object):
"""DB-API Connection to Google BigQuery.
Args:
client (Optional[google.cloud.bigquery.Client]):
A REST API client used to connect to BigQuery. If not passed, a
client is created using default options inferred from the environment.
bqstorage_client(\
Optional[google.cloud.bigquery_storage_v1.BigQueryReadClient] \
):
A client that uses the faster BigQuery Storage API to fetch rows from
BigQuery. If not passed, it is created using the same credentials
as ``client`` (provided that BigQuery Storage dependencies are installed).
prefer_bqstorage_client (Optional[bool]):
Prefer the BigQuery Storage client over the REST client. If Storage
client isn't available, fall back to the REST client. Defaults to
``True``.
"""
def __init__(
self,
client=None,
bqstorage_client=None,
prefer_bqstorage_client=True,
):
if client is None:
client = bigquery.Client()
self._owns_client = True
else:
self._owns_client = False
# A warning is already raised by the BQ Storage client factory if
# instantiation fails, or if the given BQ Storage client instance is outdated.
if not prefer_bqstorage_client:
bqstorage_client = None
self._owns_bqstorage_client = False
elif bqstorage_client is None:
bqstorage_client = client._ensure_bqstorage_client()
self._owns_bqstorage_client = bqstorage_client is not None
else:
self._owns_bqstorage_client = False
bqstorage_client = client._ensure_bqstorage_client(bqstorage_client)
self._client = client
self._bqstorage_client = bqstorage_client
self._closed = False
self._cursors_created = weakref.WeakSet()
def close(self):
"""Close the connection and any cursors created from it.
Any BigQuery clients explicitly passed to the constructor are *not*
closed, only those created by the connection instance itself.
"""
self._closed = True
if self._owns_client:
self._client.close()
if self._owns_bqstorage_client:
# There is no close() on the BQ Storage client itself.
self._bqstorage_client._transport.grpc_channel.close()
for cursor_ in self._cursors_created:
if not cursor_._closed:
cursor_.close()
def commit(self):
"""No-op, but for consistency raise an error if connection is closed."""
def cursor(self):
"""Return a new cursor object.
Returns:
google.cloud.bigquery.dbapi.Cursor: A DB-API cursor that uses this connection.
"""
new_cursor = cursor.Cursor(self)
self._cursors_created.add(new_cursor)
return new_cursor
def connect(client=None, bqstorage_client=None, prefer_bqstorage_client=True):
"""Construct a DB-API connection to Google BigQuery.
Args:
client (Optional[google.cloud.bigquery.Client]):
A REST API client used to connect to BigQuery. If not passed, a
client is created using default options inferred from the environment.
bqstorage_client(\
Optional[google.cloud.bigquery_storage_v1.BigQueryReadClient] \
):
A client that uses the faster BigQuery Storage API to fetch rows from
BigQuery. If not passed, it is created using the same credentials
as ``client`` (provided that BigQuery Storage dependencies are installed).
prefer_bqstorage_client (Optional[bool]):
Prefer the BigQuery Storage client over the REST client. If Storage
client isn't available, fall back to the REST client. Defaults to
``True``.
Returns:
google.cloud.bigquery.dbapi.Connection: A new DB-API connection to BigQuery.
"""
return Connection(client, bqstorage_client, prefer_bqstorage_client)
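# --- Illustrative usage sketch (not part of the module above) ---
# Opening a DB-API connection without an explicit client: credentials and the
# project are inferred from the environment, and closing the connection also
# closes the cursors it created (and the client it owns).
from google.cloud.bigquery import dbapi

conn = dbapi.connect()
cur = conn.cursor()
cur.execute("SELECT 1 AS x, 'hello' AS y")
print(cur.fetchall())
conn.close()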

View File

@@ -0,0 +1,586 @@
# Copyright 2017 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Cursor for the Google BigQuery DB-API."""
from __future__ import annotations
import collections
from collections import abc as collections_abc
import re
from typing import Optional
try:
from google.cloud.bigquery_storage import ArrowSerializationOptions
except ImportError:
_ARROW_COMPRESSION_SUPPORT = False
else:
# Having BQ Storage available implies that pyarrow >=1.0.0 is available, too.
_ARROW_COMPRESSION_SUPPORT = True
from google.cloud.bigquery import job
from google.cloud.bigquery.dbapi import _helpers
from google.cloud.bigquery.dbapi import exceptions
import google.cloud.exceptions # type: ignore
# Per PEP 249: A 7-item sequence containing information describing one result
# column. The first two items (name and type_code) are mandatory, the other
# five are optional and are set to None if no meaningful values can be
# provided.
Column = collections.namedtuple(
"Column",
[
"name",
"type_code",
"display_size",
"internal_size",
"precision",
"scale",
"null_ok",
],
)
@_helpers.raise_on_closed("Operating on a closed cursor.")
class Cursor(object):
"""DB-API Cursor to Google BigQuery.
Args:
connection (google.cloud.bigquery.dbapi.Connection):
A DB-API connection to Google BigQuery.
"""
def __init__(self, connection):
self.connection = connection
self.description = None
# Per PEP 249: The attribute is -1 in case no .execute*() has been
# performed on the cursor or the rowcount of the last operation
# cannot be determined by the interface.
self.rowcount = -1
# Per PEP 249: The arraysize attribute defaults to 1, meaning to fetch
# a single row at a time. However, we deviate from that, and set the
# default to None, allowing the backend to automatically determine the
# most appropriate size.
self.arraysize = None
self._query_data = None
self._query_rows = None
self._closed = False
@property
def query_job(self) -> Optional[job.QueryJob]:
"""google.cloud.bigquery.job.query.QueryJob | None: The query job
created by the last ``execute*()`` call, if a query job was created.
.. note::
If the last ``execute*()`` call was ``executemany()``, this is the
last job created by ``executemany()``."""
rows = self._query_rows
if rows is None:
return None
job_id = rows.job_id
project = rows.project
location = rows.location
client = self.connection._client
if job_id is None:
return None
return client.get_job(job_id, location=location, project=project)
def close(self):
"""Mark the cursor as closed, preventing its further use."""
self._closed = True
def _set_description(self, schema):
"""Set description from schema.
Args:
schema (Sequence[google.cloud.bigquery.schema.SchemaField]):
A description of fields in the schema.
"""
if schema is None:
self.description = None
return
self.description = tuple(
Column(
name=field.name,
type_code=field.field_type,
display_size=None,
internal_size=None,
precision=None,
scale=None,
null_ok=field.is_nullable,
)
for field in schema
)
def _set_rowcount(self, rows):
"""Set the rowcount from a RowIterator.
Normally, this sets rowcount to the number of rows returned by the
query, but if it was a DML statement, it sets rowcount to the number
of modified rows.
Args:
rows (google.cloud.bigquery.table.RowIterator):
Results of a query.
"""
total_rows = 0
num_dml_affected_rows = rows.num_dml_affected_rows
if rows.total_rows is not None and rows.total_rows > 0:
total_rows = rows.total_rows
if num_dml_affected_rows is not None and num_dml_affected_rows > 0:
total_rows = num_dml_affected_rows
self.rowcount = total_rows
def execute(self, operation, parameters=None, job_id=None, job_config=None):
"""Prepare and execute a database operation.
.. note::
When setting query parameters, values which are "text"
(``unicode`` in Python2, ``str`` in Python3) will use
the 'STRING' BigQuery type. Values which are "bytes" (``str`` in
Python2, ``bytes`` in Python3) will use the 'BYTES' type.
A `~datetime.datetime` parameter without timezone information uses
the 'DATETIME' BigQuery type (example: Global Pi Day Celebration
March 14, 2017 at 1:59pm). A `~datetime.datetime` parameter with
timezone information uses the 'TIMESTAMP' BigQuery type (example:
a wedding on April 29, 2011 at 11am, British Summer Time).
For more information about BigQuery data types, see:
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types
``STRUCT``/``RECORD`` and ``REPEATED`` query parameters are supported
only when an explicit type is given in the placeholder (for example,
``%(name:struct<x int64>)s``). See also:
https://github.com/GoogleCloudPlatform/google-cloud-python/issues/3524
Args:
operation (str): A Google BigQuery query string.
parameters (Union[Mapping[str, Any], Sequence[Any]]):
(Optional) dictionary or sequence of parameter values.
job_id (str | None):
(Optional and discouraged) The job ID to use when creating
the query job. For best performance and reliability, manually
setting a job ID is discouraged.
job_config (google.cloud.bigquery.job.QueryJobConfig):
(Optional) Extra configuration options for the query job.
"""
formatted_operation, parameter_types = _format_operation(operation, parameters)
self._execute(
formatted_operation, parameters, job_id, job_config, parameter_types
)
def _execute(
self, formatted_operation, parameters, job_id, job_config, parameter_types
):
self._query_data = None
self._query_results = None
client = self.connection._client
# The DB-API uses the pyformat formatting, since the way BigQuery does
# query parameters was not one of the standard options. Convert both
# the query and the parameters to the format expected by the client
# libraries.
query_parameters = _helpers.to_query_parameters(parameters, parameter_types)
config = job_config or job.QueryJobConfig()
config.query_parameters = query_parameters
# Start the query and wait for the query to finish.
try:
if job_id is not None:
rows = client.query(
formatted_operation,
job_config=job_config,
job_id=job_id,
).result(
page_size=self.arraysize,
)
else:
rows = client.query_and_wait(
formatted_operation,
job_config=config,
page_size=self.arraysize,
)
except google.cloud.exceptions.GoogleCloudError as exc:
raise exceptions.DatabaseError(exc)
self._query_rows = rows
self._set_description(rows.schema)
if config.dry_run:
self.rowcount = 0
else:
self._set_rowcount(rows)
def executemany(self, operation, seq_of_parameters):
"""Prepare and execute a database operation multiple times.
Args:
operation (str): A Google BigQuery query string.
seq_of_parameters (Union[Sequence[Mapping[str, Any], Sequence[Any]]]):
Sequence of many sets of parameter values.
"""
if seq_of_parameters:
rowcount = 0
# There's no reason to format the line more than once, as
# the operation only barely depends on the parameters. So
# we just use the first set of parameters. If there are
# different numbers or types of parameters, we'll error
# anyway.
formatted_operation, parameter_types = _format_operation(
operation, seq_of_parameters[0]
)
for parameters in seq_of_parameters:
self._execute(
formatted_operation, parameters, None, None, parameter_types
)
rowcount += self.rowcount
self.rowcount = rowcount
def _try_fetch(self, size=None):
"""Try to start fetching data, if not yet started.
Mutates self to indicate that iteration has started.
"""
if self._query_data is not None:
# Already started fetching the data.
return
rows = self._query_rows
if rows is None:
raise exceptions.InterfaceError(
"No query results: execute() must be called before fetch."
)
bqstorage_client = self.connection._bqstorage_client
if rows._should_use_bqstorage(
bqstorage_client,
create_bqstorage_client=False,
):
rows_iterable = self._bqstorage_fetch(bqstorage_client)
self._query_data = _helpers.to_bq_table_rows(rows_iterable)
return
self._query_data = iter(rows)
def _bqstorage_fetch(self, bqstorage_client):
"""Start fetching data with the BigQuery Storage API.
The method assumes that the data about the relevant query job already
exists internally.
Args:
bqstorage_client(\
google.cloud.bigquery_storage_v1.BigQueryReadClient \
):
A client that knows how to talk to the BigQuery Storage API.
Returns:
Iterable[Mapping]:
A sequence of rows, represented as dictionaries.
"""
# Hitting this code path with a BQ Storage client instance implies that
# bigquery_storage can indeed be imported here without errors.
from google.cloud import bigquery_storage
table_reference = self._query_rows._table
requested_session = bigquery_storage.types.ReadSession(
table=table_reference.to_bqstorage(),
data_format=bigquery_storage.types.DataFormat.ARROW,
)
if _ARROW_COMPRESSION_SUPPORT:
requested_session.read_options.arrow_serialization_options.buffer_compression = (
ArrowSerializationOptions.CompressionCodec.LZ4_FRAME
)
read_session = bqstorage_client.create_read_session(
parent="projects/{}".format(table_reference.project),
read_session=requested_session,
# a single stream only, as DB API is not well-suited for multithreading
max_stream_count=1,
)
if not read_session.streams:
return iter([]) # empty table, nothing to read
stream_name = read_session.streams[0].name
read_rows_stream = bqstorage_client.read_rows(stream_name)
rows_iterable = read_rows_stream.rows(read_session)
return rows_iterable
def fetchone(self):
"""Fetch a single row from the results of the last ``execute*()`` call.
.. note::
If a dry run query was executed, no rows are returned.
Returns:
Tuple:
A tuple representing a row or ``None`` if no more data is
available.
Raises:
google.cloud.bigquery.dbapi.InterfaceError: if called before ``execute()``.
"""
self._try_fetch()
try:
return next(self._query_data)
except StopIteration:
return None
def fetchmany(self, size=None):
"""Fetch multiple results from the last ``execute*()`` call.
.. note::
If a dry run query was executed, no rows are returned.
.. note::
The size parameter is not used for the request/response size.
Set the ``arraysize`` attribute before calling ``execute()`` to
set the batch size.
Args:
size (int):
(Optional) Maximum number of rows to return. Defaults to the
``arraysize`` property value. If ``arraysize`` is not set, it
defaults to ``1``.
Returns:
List[Tuple]: A list of rows.
Raises:
google.cloud.bigquery.dbapi.InterfaceError: if called before ``execute()``.
"""
if size is None:
# Since self.arraysize can be None (a deviation from PEP 249),
# use an actual PEP 249 default of 1 in such case (*some* number
# is needed here).
size = self.arraysize if self.arraysize else 1
self._try_fetch(size=size)
rows = []
for row in self._query_data:
rows.append(row)
if len(rows) >= size:
break
return rows
def fetchall(self):
"""Fetch all remaining results from the last ``execute*()`` call.
.. note::
If a dry run query was executed, no rows are returned.
Returns:
List[Tuple]: A list of all the rows in the results.
Raises:
google.cloud.bigquery.dbapi.InterfaceError: if called before ``execute()``.
"""
self._try_fetch()
return list(self._query_data)
def setinputsizes(self, sizes):
"""No-op, but for consistency raise an error if cursor is closed."""
def setoutputsize(self, size, column=None):
"""No-op, but for consistency raise an error if cursor is closed."""
def __iter__(self):
self._try_fetch()
return iter(self._query_data)
def _format_operation_list(operation, parameters):
"""Formats parameters in operation in the way BigQuery expects.
The input operation will be a query like ``SELECT %s`` and the output
will be a query like ``SELECT ?``.
Args:
operation (str): A Google BigQuery query string.
parameters (Sequence[Any]): Sequence of parameter values.
Returns:
str: A formatted query string.
Raises:
google.cloud.bigquery.dbapi.ProgrammingError:
if a parameter used in the operation is not found in the
``parameters`` argument.
"""
formatted_params = ["?" for _ in parameters]
try:
return operation % tuple(formatted_params)
except (TypeError, ValueError) as exc:
raise exceptions.ProgrammingError(exc)
def _format_operation_dict(operation, parameters):
"""Formats parameters in operation in the way BigQuery expects.
The input operation will be a query like ``SELECT %(namedparam)s`` and
the output will be a query like ``SELECT @namedparam``.
Args:
operation (str): A Google BigQuery query string.
parameters (Mapping[str, Any]): Dictionary of parameter values.
Returns:
str: A formatted query string.
Raises:
google.cloud.bigquery.dbapi.ProgrammingError:
if a parameter used in the operation is not found in the
``parameters`` argument.
"""
formatted_params = {}
for name in parameters:
escaped_name = name.replace("`", r"\`")
formatted_params[name] = "@`{}`".format(escaped_name)
try:
return operation % formatted_params
except (KeyError, ValueError, TypeError) as exc:
raise exceptions.ProgrammingError(exc)
def _format_operation(operation, parameters):
"""Formats parameters in operation in way BigQuery expects.
Args:
operation (str): A Google BigQuery query string.
parameters (Union[Mapping[str, Any], Sequence[Any]]):
Optional parameter values.
Returns:
str: A formatted query string.
Raises:
google.cloud.bigquery.dbapi.ProgrammingError:
if a parameter used in the operation is not found in the
``parameters`` argument.
"""
if parameters is None or len(parameters) == 0:
return operation.replace("%%", "%"), None # Still do percent de-escaping.
operation, parameter_types = _extract_types(operation)
if parameter_types is None:
raise exceptions.ProgrammingError(
f"Parameters were provided, but {repr(operation)} has no placeholders."
)
if isinstance(parameters, collections_abc.Mapping):
return _format_operation_dict(operation, parameters), parameter_types
return _format_operation_list(operation, parameters), parameter_types
def _extract_types(
operation,
extra_type_sub=re.compile(
r"""
(%*) # Extra %s. We'll deal with these in the replacement code
% # Beginning of replacement, %s, %(...)s
(?:\( # Begin of optional name and/or type
([^:)]*) # name
(?:: # ':' introduces type
( # start of type group
[a-zA-Z0-9_<>, ]+ # First part, no parens
(?: # start sets of parens + non-paren text
\([0-9 ,]+\) # comma-separated groups of digits in parens
# (e.g. string(10))
(?=[, >)]) # Must be followed by ,>) or space
[a-zA-Z0-9<>, ]* # Optional non-paren chars
)* # Can be zero or more of parens and following text
) # end of type group
)? # close type clause ":type"
\))? # End of optional name and/or type
s # End of replacement
""",
re.VERBOSE,
).sub,
):
"""Remove type information from parameter placeholders.
For every parameter of the form %(name:type)s, replace with %(name)s and add the
item name->type to dict that's returned.
Returns operation without type information and a dictionary of names and types.
"""
parameter_types = None
def repl(m):
nonlocal parameter_types
prefix, name, type_ = m.groups()
if len(prefix) % 2:
# The prefix has an odd number of %s, the last of which
# escapes the % we're looking for, so we don't want to
# change anything.
return m.group(0)
try:
if name:
if not parameter_types:
parameter_types = {}
if type_:
if name in parameter_types:
if type_ != parameter_types[name]:
raise exceptions.ProgrammingError(
f"Conflicting types for {name}: "
f"{parameter_types[name]} and {type_}."
)
else:
parameter_types[name] = type_
else:
if not isinstance(parameter_types, dict):
raise TypeError()
return f"{prefix}%({name})s"
else:
if parameter_types is None:
parameter_types = []
parameter_types.append(type_)
return f"{prefix}%s"
except (AttributeError, TypeError):
raise exceptions.ProgrammingError(
f"{repr(operation)} mixes named and unamed parameters."
)
return extra_type_sub(repl, operation), parameter_types
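# --- Illustrative usage sketch (not part of the module above) ---
# Parameters use pyformat placeholders; an optional ":type" suffix (stripped
# by _extract_types above) pins the BigQuery type, which complex values such
# as structs require. The table name below is a placeholder.
from google.cloud.bigquery import dbapi

conn = dbapi.connect()
cur = conn.cursor()
cur.execute(
    "SELECT name FROM `my-project.my_dataset.people` WHERE age >= %(min_age:INT64)s",
    {"min_age": 18},
)
for row in cur.fetchmany(10):
    print(row[0])
conn.close()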

View File

@@ -0,0 +1,58 @@
# Copyright 2017 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Exceptions used in the Google BigQuery DB-API."""
class Warning(Exception):
"""Exception raised for important DB-API warnings."""
class Error(Exception):
"""Exception representing all non-warning DB-API errors."""
class InterfaceError(Error):
"""DB-API error related to the database interface."""
class DatabaseError(Error):
"""DB-API error related to the database."""
class DataError(DatabaseError):
"""DB-API error due to problems with the processed data."""
class OperationalError(DatabaseError):
"""DB-API error related to the database operation.
These errors are not necessarily under the control of the programmer.
"""
class IntegrityError(DatabaseError):
"""DB-API error when integrity of the database is affected."""
class InternalError(DatabaseError):
"""DB-API error when the database encounters an internal error."""
class ProgrammingError(DatabaseError):
"""DB-API exception raised for programming errors."""
class NotSupportedError(DatabaseError):
"""DB-API error for operations not supported by the database or API."""

View File

@@ -0,0 +1,96 @@
# Copyright 2017 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Types used in the Google BigQuery DB-API.
See `PEP-249`_ for details.
.. _PEP-249:
https://www.python.org/dev/peps/pep-0249/#type-objects-and-constructors
"""
import datetime
Date = datetime.date
Time = datetime.time
Timestamp = datetime.datetime
DateFromTicks = datetime.date.fromtimestamp
TimestampFromTicks = datetime.datetime.fromtimestamp
def Binary(data):
"""Contruct a DB-API binary value.
Args:
data (bytes-like): An object containing binary data and that
can be converted to bytes with the `bytes` builtin.
Returns:
bytes: The binary data as a bytes object.
"""
if isinstance(data, int):
# This is not the conversion we're looking for, because it
# will simply create a bytes object of the given size.
raise TypeError("cannot convert `int` object to binary")
try:
return bytes(data)
except TypeError:
if isinstance(data, str):
return data.encode("utf-8")
else:
raise
def TimeFromTicks(ticks, tz=None):
"""Construct a DB-API time value from the given ticks value.
Args:
ticks (float):
a number of seconds since the epoch; see the documentation of the
standard Python time module for details.
tz (datetime.tzinfo): (Optional) time zone to use for conversion
Returns:
datetime.time: time represented by ticks.
"""
dt = datetime.datetime.fromtimestamp(ticks, tz=tz)
return dt.timetz()
class _DBAPITypeObject(object):
"""DB-API type object which compares equal to many different strings.
See `PEP-249`_ for details.
.. _PEP-249:
https://www.python.org/dev/peps/pep-0249/#implementation-hints-for-module-authors
"""
def __init__(self, *values):
self.values = values
def __eq__(self, other):
return other in self.values
STRING = "STRING"
BINARY = _DBAPITypeObject("BYTES", "RECORD", "STRUCT")
NUMBER = _DBAPITypeObject(
"INTEGER", "INT64", "FLOAT", "FLOAT64", "NUMERIC", "BIGNUMERIC", "BOOLEAN", "BOOL"
)
DATETIME = _DBAPITypeObject("TIMESTAMP", "DATE", "TIME", "DATETIME")
ROWID = "ROWID"

View File

@@ -0,0 +1,84 @@
# Copyright 2015 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Define class for the custom encryption configuration."""
import copy
class EncryptionConfiguration(object):
"""Custom encryption configuration (e.g., Cloud KMS keys).
Args:
kms_key_name (str): resource ID of Cloud KMS key used for encryption
"""
def __init__(self, kms_key_name=None) -> None:
self._properties = {}
if kms_key_name is not None:
self._properties["kmsKeyName"] = kms_key_name
@property
def kms_key_name(self):
"""str: Resource ID of Cloud KMS key
Resource ID of Cloud KMS key or :data:`None` if using default
encryption.
"""
return self._properties.get("kmsKeyName")
@kms_key_name.setter
def kms_key_name(self, value):
self._properties["kmsKeyName"] = value
@classmethod
def from_api_repr(cls, resource):
"""Construct an encryption configuration from its API representation
Args:
resource (Dict[str, object]):
An encryption configuration representation as returned from
the API.
Returns:
google.cloud.bigquery.table.EncryptionConfiguration:
An encryption configuration parsed from ``resource``.
"""
config = cls()
config._properties = copy.deepcopy(resource)
return config
def to_api_repr(self):
"""Construct the API resource representation of this encryption
configuration.
Returns:
Dict[str, object]:
Encryption configuration as represented as an API resource
"""
return copy.deepcopy(self._properties)
def __eq__(self, other):
if not isinstance(other, EncryptionConfiguration):
return NotImplemented
return self.kms_key_name == other.kms_key_name
def __ne__(self, other):
return not self == other
def __hash__(self):
return hash(self.kms_key_name)
def __repr__(self):
return "EncryptionConfiguration({})".format(self.kms_key_name)

View File

@@ -0,0 +1,389 @@
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import enum
class AutoRowIDs(enum.Enum):
"""How to handle automatic insert IDs when inserting rows as a stream."""
DISABLED = enum.auto()
GENERATE_UUID = enum.auto()
class Compression(str, enum.Enum):
"""The compression type to use for exported files. The default value is
:attr:`NONE`.
:attr:`DEFLATE` and :attr:`SNAPPY` are
only supported for Avro.
"""
GZIP = "GZIP"
"""Specifies GZIP format."""
DEFLATE = "DEFLATE"
"""Specifies DEFLATE format."""
SNAPPY = "SNAPPY"
"""Specifies SNAPPY format."""
ZSTD = "ZSTD"
"""Specifies ZSTD format."""
NONE = "NONE"
"""Specifies no compression."""
class DecimalTargetType:
"""The data types that could be used as a target type when converting decimal values.
https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#DecimalTargetType
.. versionadded:: 2.21.0
"""
NUMERIC = "NUMERIC"
"""Decimal values could be converted to NUMERIC type."""
BIGNUMERIC = "BIGNUMERIC"
"""Decimal values could be converted to BIGNUMERIC type."""
STRING = "STRING"
"""Decimal values could be converted to STRING type."""
class CreateDisposition(object):
"""Specifies whether the job is allowed to create new tables. The default
value is :attr:`CREATE_IF_NEEDED`.
Creation, truncation and append actions occur as one atomic update
upon job completion.
"""
CREATE_IF_NEEDED = "CREATE_IF_NEEDED"
"""If the table does not exist, BigQuery creates the table."""
CREATE_NEVER = "CREATE_NEVER"
"""The table must already exist. If it does not, a 'notFound' error is
returned in the job result."""
class DefaultPandasDTypes(enum.Enum):
"""Default Pandas DataFrem DTypes to convert BigQuery data. These
Sentinel values are used instead of None to maintain backward compatibility,
and allow Pandas package is not available. For more information:
https://stackoverflow.com/a/60605919/101923
"""
BOOL_DTYPE = object()
"""Specifies default bool dtype"""
INT_DTYPE = object()
"""Specifies default integer dtype"""
DATE_DTYPE = object()
"""Specifies default date dtype"""
TIME_DTYPE = object()
"""Specifies default time dtype"""
RANGE_DATE_DTYPE = object()
"""Specifies default range date dtype"""
RANGE_DATETIME_DTYPE = object()
"""Specifies default range datetime dtype"""
RANGE_TIMESTAMP_DTYPE = object()
"""Specifies default range timestamp dtype"""
class DestinationFormat(object):
"""The exported file format. The default value is :attr:`CSV`.
Tables with nested or repeated fields cannot be exported as CSV.
"""
CSV = "CSV"
"""Specifies CSV format."""
NEWLINE_DELIMITED_JSON = "NEWLINE_DELIMITED_JSON"
"""Specifies newline delimited JSON format."""
AVRO = "AVRO"
"""Specifies Avro format."""
PARQUET = "PARQUET"
"""Specifies Parquet format."""
class Encoding(object):
"""The character encoding of the data. The default is :attr:`UTF_8`.
BigQuery decodes the data after the raw, binary data has been
split using the values of the quote and fieldDelimiter properties.
"""
UTF_8 = "UTF-8"
"""Specifies UTF-8 encoding."""
ISO_8859_1 = "ISO-8859-1"
"""Specifies ISO-8859-1 encoding."""
class QueryPriority(object):
"""Specifies a priority for the query. The default value is
:attr:`INTERACTIVE`.
"""
INTERACTIVE = "INTERACTIVE"
"""Specifies interactive priority."""
BATCH = "BATCH"
"""Specifies batch priority."""
class QueryApiMethod(str, enum.Enum):
"""API method used to start the query. The default value is
:attr:`INSERT`.
"""
INSERT = "INSERT"
"""Submit a query job by using the `jobs.insert REST API method
<https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/insert>`_.
This supports all job configuration options.
"""
QUERY = "QUERY"
"""Submit a query job by using the `jobs.query REST API method
<https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/query>`_.
Differences from ``INSERT``:
* Many parameters and job configuration options, including job ID and
destination table, cannot be used
with this API method. See the `jobs.query REST API documentation
<https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/query>`_ for
the complete list of supported configuration options.
* API blocks up to a specified timeout, waiting for the query to
finish.
* The full job resource (including job statistics) may not be available.
Call :meth:`~google.cloud.bigquery.job.QueryJob.reload` or
:meth:`~google.cloud.bigquery.client.Client.get_job` to get full job
statistics and configuration.
* :meth:`~google.cloud.bigquery.Client.query` can raise API exceptions if
the query fails, whereas the same errors don't appear until calling
:meth:`~google.cloud.bigquery.job.QueryJob.result` when the ``INSERT``
API method is used.
"""
class SchemaUpdateOption(object):
"""Specifies an update to the destination table schema as a side effect of
a load job.
"""
ALLOW_FIELD_ADDITION = "ALLOW_FIELD_ADDITION"
"""Allow adding a nullable field to the schema."""
ALLOW_FIELD_RELAXATION = "ALLOW_FIELD_RELAXATION"
"""Allow relaxing a required field in the original schema to nullable."""
class SourceFormat(object):
"""The format of the data files. The default value is :attr:`CSV`.
Note that the set of allowed values for loading data is different
than the set used for external data sources (see
:class:`~google.cloud.bigquery.external_config.ExternalSourceFormat`).
"""
CSV = "CSV"
"""Specifies CSV format."""
DATASTORE_BACKUP = "DATASTORE_BACKUP"
"""Specifies datastore backup format"""
NEWLINE_DELIMITED_JSON = "NEWLINE_DELIMITED_JSON"
"""Specifies newline delimited JSON format."""
AVRO = "AVRO"
"""Specifies Avro format."""
PARQUET = "PARQUET"
"""Specifies Parquet format."""
ORC = "ORC"
"""Specifies Orc format."""
class KeyResultStatementKind:
"""Determines which statement in the script represents the "key result".
The "key result" is used to populate the schema and query results of the script job.
https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#keyresultstatementkind
"""
KEY_RESULT_STATEMENT_KIND_UNSPECIFIED = "KEY_RESULT_STATEMENT_KIND_UNSPECIFIED"
LAST = "LAST"
FIRST_SELECT = "FIRST_SELECT"
class StandardSqlTypeNames(str, enum.Enum):
"""Enum of allowed SQL type names in schema.SchemaField.
Datatype used in GoogleSQL.
"""
def _generate_next_value_(name, start, count, last_values):
return name
TYPE_KIND_UNSPECIFIED = enum.auto()
INT64 = enum.auto()
BOOL = enum.auto()
FLOAT64 = enum.auto()
STRING = enum.auto()
BYTES = enum.auto()
TIMESTAMP = enum.auto()
DATE = enum.auto()
TIME = enum.auto()
DATETIME = enum.auto()
INTERVAL = enum.auto()
GEOGRAPHY = enum.auto()
NUMERIC = enum.auto()
BIGNUMERIC = enum.auto()
JSON = enum.auto()
ARRAY = enum.auto()
STRUCT = enum.auto()
RANGE = enum.auto()
# NOTE: FOREIGN acts as a wrapper for data types
# not natively understood by BigQuery unless translated
FOREIGN = enum.auto()
class EntityTypes(str, enum.Enum):
"""Enum of allowed entity type names in AccessEntry"""
USER_BY_EMAIL = "userByEmail"
GROUP_BY_EMAIL = "groupByEmail"
DOMAIN = "domain"
DATASET = "dataset"
SPECIAL_GROUP = "specialGroup"
VIEW = "view"
IAM_MEMBER = "iamMember"
ROUTINE = "routine"
# See also: https://cloud.google.com/bigquery/data-types#legacy_sql_data_types
# and https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types
class SqlTypeNames(str, enum.Enum):
"""Enum of allowed SQL type names in schema.SchemaField.
Datatype used in Legacy SQL.
"""
STRING = "STRING"
BYTES = "BYTES"
INTEGER = "INTEGER"
INT64 = "INTEGER"
FLOAT = "FLOAT"
FLOAT64 = "FLOAT"
DECIMAL = NUMERIC = "NUMERIC"
BIGDECIMAL = BIGNUMERIC = "BIGNUMERIC"
BOOLEAN = "BOOLEAN"
BOOL = "BOOLEAN"
GEOGRAPHY = "GEOGRAPHY" # NOTE: not available in legacy types
RECORD = "RECORD"
STRUCT = "RECORD"
TIMESTAMP = "TIMESTAMP"
DATE = "DATE"
TIME = "TIME"
DATETIME = "DATETIME"
INTERVAL = "INTERVAL" # NOTE: not available in legacy types
RANGE = "RANGE" # NOTE: not available in legacy types
# NOTE: FOREIGN acts as a wrapper for data types
# not natively understood by BigQuery unless translated
FOREIGN = "FOREIGN"
class WriteDisposition(object):
"""Specifies the action that occurs if destination table already exists.
The default value is :attr:`WRITE_APPEND`.
Each action is atomic and only occurs if BigQuery is able to complete
the job successfully. Creation, truncation and append actions occur as one
atomic update upon job completion.
"""
WRITE_APPEND = "WRITE_APPEND"
"""If the table already exists, BigQuery appends the data to the table."""
WRITE_TRUNCATE = "WRITE_TRUNCATE"
"""If the table already exists, BigQuery overwrites the table data."""
WRITE_EMPTY = "WRITE_EMPTY"
"""If the table already exists and contains data, a 'duplicate' error is
returned in the job result."""
class DeterminismLevel:
"""Specifies determinism level for JavaScript user-defined functions (UDFs).
https://cloud.google.com/bigquery/docs/reference/rest/v2/routines#DeterminismLevel
"""
DETERMINISM_LEVEL_UNSPECIFIED = "DETERMINISM_LEVEL_UNSPECIFIED"
"""The determinism of the UDF is unspecified."""
DETERMINISTIC = "DETERMINISTIC"
"""The UDF is deterministic, meaning that 2 function calls with the same inputs
always produce the same result, even across 2 query runs."""
NOT_DETERMINISTIC = "NOT_DETERMINISTIC"
"""The UDF is not deterministic."""
class RoundingMode(str, enum.Enum):
"""Rounding mode options that can be used when storing NUMERIC or BIGNUMERIC
values.
ROUNDING_MODE_UNSPECIFIED: will default to using ROUND_HALF_AWAY_FROM_ZERO.
ROUND_HALF_AWAY_FROM_ZERO: rounds half values away from zero when applying
precision and scale upon writing of NUMERIC and BIGNUMERIC values.
For Scale: 0
* 1.1, 1.2, 1.3, 1.4 => 1
* 1.5, 1.6, 1.7, 1.8, 1.9 => 2
ROUND_HALF_EVEN: rounds half values to the nearest even value when applying
precision and scale upon writing of NUMERIC and BIGNUMERIC values.
For Scale: 0
* 1.1, 1.2, 1.3, 1.4 => 1
* 1.5 => 2
* 1.6, 1.7, 1.8, 1.9 => 2
* 2.5 => 2
"""
def _generate_next_value_(name, start, count, last_values):
return name
ROUNDING_MODE_UNSPECIFIED = enum.auto()
ROUND_HALF_AWAY_FROM_ZERO = enum.auto()
ROUND_HALF_EVEN = enum.auto()
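# --- Illustrative usage sketch (not part of the module above) ---
# Most enum members here are plain strings (or str-valued Enum members), so
# they can be assigned directly to job configuration properties.
from google.cloud.bigquery import LoadJobConfig, SourceFormat, WriteDisposition

config = LoadJobConfig(
    source_format=SourceFormat.NEWLINE_DELIMITED_JSON,
    write_disposition=WriteDisposition.WRITE_TRUNCATE,
)
print(config.source_format, config.write_disposition)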

View File

@@ -0,0 +1,35 @@
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
class BigQueryError(Exception):
"""Base class for all custom exceptions defined by the BigQuery client."""
class LegacyBigQueryStorageError(BigQueryError):
"""Raised when too old a version of BigQuery Storage extra is detected at runtime."""
class LegacyPyarrowError(BigQueryError):
"""Raised when too old a version of pyarrow package is detected at runtime."""
class BigQueryStorageNotFoundError(BigQueryError):
"""Raised when BigQuery Storage extra is not installed when trying to
import it.
"""
class LegacyPandasError(BigQueryError):
"""Raised when too old a version of pandas package is detected at runtime."""

File diff suppressed because it is too large

View File

@@ -0,0 +1,147 @@
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
from typing import Dict, Optional, Union
class AvroOptions:
"""Options if source format is set to AVRO."""
_SOURCE_FORMAT = "AVRO"
_RESOURCE_NAME = "avroOptions"
def __init__(self):
self._properties = {}
@property
def use_avro_logical_types(self) -> Optional[bool]:
"""[Optional] If sourceFormat is set to 'AVRO', indicates whether to
interpret logical types as the corresponding BigQuery data type (for
example, TIMESTAMP), instead of using the raw type (for example,
INTEGER).
See
https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#AvroOptions.FIELDS.use_avro_logical_types
"""
return self._properties.get("useAvroLogicalTypes")
@use_avro_logical_types.setter
def use_avro_logical_types(self, value):
self._properties["useAvroLogicalTypes"] = value
@classmethod
def from_api_repr(cls, resource: Dict[str, bool]) -> "AvroOptions":
"""Factory: construct an instance from a resource dict.
Args:
resource (Dict[str, bool]):
Definition of a :class:`~.format_options.AvroOptions` instance in
the same representation as is returned from the API.
Returns:
:class:`~.format_options.AvroOptions`:
Configuration parsed from ``resource``.
"""
config = cls()
config._properties = copy.deepcopy(resource)
return config
def to_api_repr(self) -> dict:
"""Build an API representation of this object.
Returns:
Dict[str, bool]:
A dictionary in the format used by the BigQuery API.
"""
return copy.deepcopy(self._properties)
class ParquetOptions:
"""Additional options if the PARQUET source format is used."""
_SOURCE_FORMAT = "PARQUET"
_RESOURCE_NAME = "parquetOptions"
def __init__(self):
self._properties = {}
@property
def enum_as_string(self) -> bool:
"""Indicates whether to infer Parquet ENUM logical type as STRING instead of
BYTES by default.
See
https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#ParquetOptions.FIELDS.enum_as_string
"""
return self._properties.get("enumAsString")
@enum_as_string.setter
def enum_as_string(self, value: bool) -> None:
self._properties["enumAsString"] = value
@property
def enable_list_inference(self) -> bool:
"""Indicates whether to use schema inference specifically for Parquet LIST
logical type.
See
https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#ParquetOptions.FIELDS.enable_list_inference
"""
return self._properties.get("enableListInference")
@enable_list_inference.setter
def enable_list_inference(self, value: bool) -> None:
self._properties["enableListInference"] = value
@property
def map_target_type(self) -> Optional[Union[bool, str]]:
"""Indicates whether to simplify the representation of parquet maps to only show keys and values."""
return self._properties.get("mapTargetType")
@map_target_type.setter
def map_target_type(self, value: str) -> None:
"""Sets the map target type.
Args:
value: The map target type (e.g. ARRAY_OF_STRUCT).
"""
self._properties["mapTargetType"] = value
@classmethod
def from_api_repr(cls, resource: Dict[str, bool]) -> "ParquetOptions":
"""Factory: construct an instance from a resource dict.
Args:
resource (Dict[str, bool]):
Definition of a :class:`~.format_options.ParquetOptions` instance in
the same representation as is returned from the API.
Returns:
:class:`~.format_options.ParquetOptions`:
Configuration parsed from ``resource``.
"""
config = cls()
config._properties = copy.deepcopy(resource)
return config
def to_api_repr(self) -> dict:
"""Build an API representation of this object.
Returns:
Dict[str, bool]:
A dictionary in the format used by the BigQuery API.
"""
return copy.deepcopy(self._properties)
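# --- Illustrative usage sketch (not part of the module above) ---
# Parquet-specific options attach to a load job configuration; the API
# representation mirrors the REST resource fields documented above.
from google.cloud.bigquery import LoadJobConfig, SourceFormat
from google.cloud.bigquery.format_options import ParquetOptions

parquet_options = ParquetOptions()
parquet_options.enable_list_inference = True
parquet_options.enum_as_string = True

config = LoadJobConfig(source_format=SourceFormat.PARQUET)
config.parquet_options = parquet_options
print(parquet_options.to_api_repr())  # {'enableListInference': True, 'enumAsString': True}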

View File

@@ -0,0 +1,38 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BigQuery API IAM policy definitions
For all allowed roles and permissions, see:
https://cloud.google.com/bigquery/docs/access-control
"""
# BigQuery-specific IAM roles available for tables and views
BIGQUERY_DATA_EDITOR_ROLE = "roles/bigquery.dataEditor"
"""When applied to a table or view, this role provides permissions to
read and update data and metadata for the table or view."""
BIGQUERY_DATA_OWNER_ROLE = "roles/bigquery.dataOwner"
"""When applied to a table or view, this role provides permissions to
read and update data and metadata for the table or view, share the
table/view, and delete the table/view."""
BIGQUERY_DATA_VIEWER_ROLE = "roles/bigquery.dataViewer"
"""When applied to a table or view, this role provides permissions to
read data and metadata from the table or view."""
BIGQUERY_METADATA_VIEWER_ROLE = "roles/bigquery.metadataViewer"
"""When applied to a table or view, this role provides persmissions to
read metadata from the table or view."""
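# --- Illustrative usage sketch (not part of the module above) ---
# The role constants are plain strings used in IAM policy bindings; the table
# ID and member below are placeholders.
from google.cloud import bigquery
from google.cloud.bigquery.iam import BIGQUERY_DATA_VIEWER_ROLE

client = bigquery.Client()
table_id = "my-project.my_dataset.my_table"
policy = client.get_iam_policy(table_id)
policy.bindings.append(
    {"role": BIGQUERY_DATA_VIEWER_ROLE, "members": {"user:reader@example.com"}}
)
client.set_iam_policy(table_id, policy)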

View File

@@ -0,0 +1,87 @@
# Copyright 2015 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Define API Jobs."""
from google.cloud.bigquery.job.base import _AsyncJob
from google.cloud.bigquery.job.base import _error_result_to_exception
from google.cloud.bigquery.job.base import _DONE_STATE
from google.cloud.bigquery.job.base import _JobConfig
from google.cloud.bigquery.job.base import _JobReference
from google.cloud.bigquery.job.base import ReservationUsage
from google.cloud.bigquery.job.base import ScriptStatistics
from google.cloud.bigquery.job.base import ScriptStackFrame
from google.cloud.bigquery.job.base import TransactionInfo
from google.cloud.bigquery.job.base import UnknownJob
from google.cloud.bigquery.job.copy_ import CopyJob
from google.cloud.bigquery.job.copy_ import CopyJobConfig
from google.cloud.bigquery.job.copy_ import OperationType
from google.cloud.bigquery.job.extract import ExtractJob
from google.cloud.bigquery.job.extract import ExtractJobConfig
from google.cloud.bigquery.job.load import LoadJob
from google.cloud.bigquery.job.load import LoadJobConfig
from google.cloud.bigquery.job.query import _contains_order_by
from google.cloud.bigquery.job.query import DmlStats
from google.cloud.bigquery.job.query import QueryJob
from google.cloud.bigquery.job.query import QueryJobConfig
from google.cloud.bigquery.job.query import QueryPlanEntry
from google.cloud.bigquery.job.query import QueryPlanEntryStep
from google.cloud.bigquery.job.query import ScriptOptions
from google.cloud.bigquery.job.query import TimelineEntry
from google.cloud.bigquery.enums import Compression
from google.cloud.bigquery.enums import CreateDisposition
from google.cloud.bigquery.enums import DestinationFormat
from google.cloud.bigquery.enums import Encoding
from google.cloud.bigquery.enums import QueryPriority
from google.cloud.bigquery.enums import SchemaUpdateOption
from google.cloud.bigquery.enums import SourceFormat
from google.cloud.bigquery.enums import WriteDisposition
# Include classes previously in job.py for backwards compatibility.
__all__ = [
"_AsyncJob",
"_error_result_to_exception",
"_DONE_STATE",
"_JobConfig",
"_JobReference",
"ReservationUsage",
"ScriptStatistics",
"ScriptStackFrame",
"UnknownJob",
"CopyJob",
"CopyJobConfig",
"OperationType",
"ExtractJob",
"ExtractJobConfig",
"LoadJob",
"LoadJobConfig",
"_contains_order_by",
"DmlStats",
"QueryJob",
"QueryJobConfig",
"QueryPlanEntry",
"QueryPlanEntryStep",
"ScriptOptions",
"TimelineEntry",
"Compression",
"CreateDisposition",
"DestinationFormat",
"Encoding",
"QueryPriority",
"SchemaUpdateOption",
"SourceFormat",
"TransactionInfo",
"WriteDisposition",
]
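# --- Illustrative usage sketch (not part of the module above) ---
# Everything re-exported here is also importable from the top-level package;
# a dry-run, batch-priority query configuration is a small example.
from google.cloud.bigquery.job import QueryJobConfig, QueryPriority

config = QueryJobConfig(dry_run=True, priority=QueryPriority.BATCH)
print(config.dry_run, config.priority)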

File diff suppressed because it is too large

View File

@@ -0,0 +1,282 @@
# Copyright 2015 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Classes for copy jobs."""
import typing
from typing import Optional
from google.cloud.bigquery.encryption_configuration import EncryptionConfiguration
from google.cloud.bigquery import _helpers
from google.cloud.bigquery.table import TableReference
from google.cloud.bigquery.job.base import _AsyncJob
from google.cloud.bigquery.job.base import _JobConfig
from google.cloud.bigquery.job.base import _JobReference
class OperationType:
"""Different operation types supported in table copy job.
https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#operationtype
"""
OPERATION_TYPE_UNSPECIFIED = "OPERATION_TYPE_UNSPECIFIED"
"""Unspecified operation type."""
COPY = "COPY"
"""The source and destination table have the same table type."""
SNAPSHOT = "SNAPSHOT"
"""The source table type is TABLE and the destination table type is SNAPSHOT."""
CLONE = "CLONE"
"""The source table type is TABLE and the destination table type is CLONE."""
RESTORE = "RESTORE"
"""The source table type is SNAPSHOT and the destination table type is TABLE."""
class CopyJobConfig(_JobConfig):
"""Configuration options for copy jobs.
All properties in this class are optional. Values which are :data:`None`
fall back to server defaults. Set properties on the constructed configuration by using
the property name as the name of a keyword argument.
"""
def __init__(self, **kwargs) -> None:
super(CopyJobConfig, self).__init__("copy", **kwargs)
@property
def create_disposition(self):
"""google.cloud.bigquery.job.CreateDisposition: Specifies behavior
for creating tables.
See
https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationTableCopy.FIELDS.create_disposition
"""
return self._get_sub_prop("createDisposition")
@create_disposition.setter
def create_disposition(self, value):
self._set_sub_prop("createDisposition", value)
@property
def write_disposition(self):
"""google.cloud.bigquery.job.WriteDisposition: Action that occurs if
the destination table already exists.
See
https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationTableCopy.FIELDS.write_disposition
"""
return self._get_sub_prop("writeDisposition")
@write_disposition.setter
def write_disposition(self, value):
self._set_sub_prop("writeDisposition", value)
@property
def destination_encryption_configuration(self):
"""google.cloud.bigquery.encryption_configuration.EncryptionConfiguration: Custom
encryption configuration for the destination table.
Custom encryption configuration (e.g., Cloud KMS keys) or :data:`None`
if using default encryption.
See
https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationTableCopy.FIELDS.destination_encryption_configuration
"""
prop = self._get_sub_prop("destinationEncryptionConfiguration")
if prop is not None:
prop = EncryptionConfiguration.from_api_repr(prop)
return prop
@destination_encryption_configuration.setter
def destination_encryption_configuration(self, value):
api_repr = value
if value is not None:
api_repr = value.to_api_repr()
self._set_sub_prop("destinationEncryptionConfiguration", api_repr)
@property
def operation_type(self) -> str:
"""The operation to perform with this copy job.
See
https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationTableCopy.FIELDS.operation_type
"""
return self._get_sub_prop(
"operationType", OperationType.OPERATION_TYPE_UNSPECIFIED
)
@operation_type.setter
def operation_type(self, value: Optional[str]):
if value is None:
value = OperationType.OPERATION_TYPE_UNSPECIFIED
self._set_sub_prop("operationType", value)
@property
def destination_expiration_time(self) -> str:
"""google.cloud.bigquery.job.DestinationExpirationTime: The time when the
destination table expires. Expired tables will be deleted and their storage reclaimed.
See
https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationTableCopy.FIELDS.destination_expiration_time
"""
return self._get_sub_prop("destinationExpirationTime")
@destination_expiration_time.setter
def destination_expiration_time(self, value: str):
self._set_sub_prop("destinationExpirationTime", value)
class CopyJob(_AsyncJob):
"""Asynchronous job: copy data into a table from other tables.
Args:
job_id (str): the job's ID, within the project belonging to ``client``.
sources (List[google.cloud.bigquery.table.TableReference]): Table(s) from which data is to be copied.
destination (google.cloud.bigquery.table.TableReference): Table into which data is to be copied.
client (google.cloud.bigquery.client.Client):
A client which holds credentials and project configuration
for the dataset (which requires a project).
job_config (Optional[google.cloud.bigquery.job.CopyJobConfig]):
Extra configuration options for the copy job.
"""
_JOB_TYPE = "copy"
_CONFIG_CLASS = CopyJobConfig
def __init__(self, job_id, sources, destination, client, job_config=None):
super(CopyJob, self).__init__(job_id, client)
if job_config is not None:
self._properties["configuration"] = job_config._properties
if destination:
_helpers._set_sub_prop(
self._properties,
["configuration", "copy", "destinationTable"],
destination.to_api_repr(),
)
if sources:
source_resources = [source.to_api_repr() for source in sources]
_helpers._set_sub_prop(
self._properties,
["configuration", "copy", "sourceTables"],
source_resources,
)
@property
def configuration(self) -> CopyJobConfig:
"""The configuration for this copy job."""
return typing.cast(CopyJobConfig, super().configuration)
@property
def destination(self):
"""google.cloud.bigquery.table.TableReference: Table into which data
is to be loaded.
"""
return TableReference.from_api_repr(
_helpers._get_sub_prop(
self._properties, ["configuration", "copy", "destinationTable"]
)
)
@property
def sources(self):
"""List[google.cloud.bigquery.table.TableReference]): Table(s) from
which data is to be loaded.
"""
source_configs = _helpers._get_sub_prop(
self._properties, ["configuration", "copy", "sourceTables"]
)
if source_configs is None:
single = _helpers._get_sub_prop(
self._properties, ["configuration", "copy", "sourceTable"]
)
if single is None:
raise KeyError("Resource missing 'sourceTables' / 'sourceTable'")
source_configs = [single]
sources = []
for source_config in source_configs:
table_ref = TableReference.from_api_repr(source_config)
sources.append(table_ref)
return sources
@property
def create_disposition(self):
"""See
:attr:`google.cloud.bigquery.job.CopyJobConfig.create_disposition`.
"""
return self.configuration.create_disposition
@property
def write_disposition(self):
"""See
:attr:`google.cloud.bigquery.job.CopyJobConfig.write_disposition`.
"""
return self.configuration.write_disposition
@property
def destination_encryption_configuration(self):
"""google.cloud.bigquery.encryption_configuration.EncryptionConfiguration: Custom
encryption configuration for the destination table.
Custom encryption configuration (e.g., Cloud KMS keys) or :data:`None`
if using default encryption.
See
:attr:`google.cloud.bigquery.job.CopyJobConfig.destination_encryption_configuration`.
"""
return self.configuration.destination_encryption_configuration
def to_api_repr(self):
"""Generate a resource for :meth:`_begin`."""
# Exclude statistics, if set.
return {
"jobReference": self._properties["jobReference"],
"configuration": self._properties["configuration"],
}
@classmethod
def from_api_repr(cls, resource, client):
"""Factory: construct a job given its API representation
.. note::
This method assumes that the project found in the resource matches
the client's project.
Args:
resource (Dict): dataset job representation returned from the API
client (google.cloud.bigquery.client.Client):
Client which holds credentials and project
configuration for the dataset.
Returns:
google.cloud.bigquery.job.CopyJob: Job parsed from ``resource``.
"""
cls._check_resource_config(resource)
job_ref = _JobReference._from_api_repr(resource["jobReference"])
job = cls(job_ref, None, None, client=client)
job._set_properties(resource)
return job
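
A minimal usage sketch of the copy-job classes above, assuming application-default credentials; the project, dataset, and table IDs are hypothetical placeholders:

from google.cloud import bigquery
from google.cloud.bigquery.job import CopyJobConfig, OperationType, WriteDisposition

client = bigquery.Client()  # assumes default credentials and project

# Overwrite the destination table with a plain table-to-table copy.
config = CopyJobConfig(
    write_disposition=WriteDisposition.WRITE_TRUNCATE,
    operation_type=OperationType.COPY,
)
copy_job = client.copy_table(
    "my-project.my_dataset.source_table",       # hypothetical source
    "my-project.my_dataset.destination_table",  # hypothetical destination
    job_config=config,
)
copy_job.result()  # block until the copy job completes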

View File

@@ -0,0 +1,271 @@
# Copyright 2015 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Classes for extract (export) jobs."""
import typing
from google.cloud.bigquery import _helpers
from google.cloud.bigquery.model import ModelReference
from google.cloud.bigquery.table import Table
from google.cloud.bigquery.table import TableListItem
from google.cloud.bigquery.table import TableReference
from google.cloud.bigquery.job.base import _AsyncJob
from google.cloud.bigquery.job.base import _JobConfig
from google.cloud.bigquery.job.base import _JobReference
class ExtractJobConfig(_JobConfig):
"""Configuration options for extract jobs.
All properties in this class are optional. Values which are :data:`None` use
the server defaults. Set properties on the constructed configuration by using
the property name as the name of a keyword argument.
"""
def __init__(self, **kwargs):
super(ExtractJobConfig, self).__init__("extract", **kwargs)
@property
def compression(self):
"""google.cloud.bigquery.job.Compression: Compression type to use for
exported files.
See
https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationExtract.FIELDS.compression
"""
return self._get_sub_prop("compression")
@compression.setter
def compression(self, value):
self._set_sub_prop("compression", value)
@property
def destination_format(self):
"""google.cloud.bigquery.job.DestinationFormat: Exported file format.
See
https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationExtract.FIELDS.destination_format
"""
return self._get_sub_prop("destinationFormat")
@destination_format.setter
def destination_format(self, value):
self._set_sub_prop("destinationFormat", value)
@property
def field_delimiter(self):
"""str: Delimiter to use between fields in the exported data.
See
https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationExtract.FIELDS.field_delimiter
"""
return self._get_sub_prop("fieldDelimiter")
@field_delimiter.setter
def field_delimiter(self, value):
self._set_sub_prop("fieldDelimiter", value)
@property
def print_header(self):
"""bool: Print a header row in the exported data.
See
https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationExtract.FIELDS.print_header
"""
return self._get_sub_prop("printHeader")
@print_header.setter
def print_header(self, value):
self._set_sub_prop("printHeader", value)
@property
def use_avro_logical_types(self):
"""bool: For loads of Avro data, governs whether Avro logical types are
converted to their corresponding BigQuery types (e.g. TIMESTAMP) rather than
raw types (e.g. INTEGER).
"""
return self._get_sub_prop("useAvroLogicalTypes")
@use_avro_logical_types.setter
def use_avro_logical_types(self, value):
self._set_sub_prop("useAvroLogicalTypes", bool(value))
class ExtractJob(_AsyncJob):
"""Asynchronous job: extract data from a table into Cloud Storage.
Args:
job_id (str): the job's ID.
source (Union[ \
google.cloud.bigquery.table.TableReference, \
google.cloud.bigquery.model.ModelReference \
]):
Table or Model from which data is to be loaded or extracted.
destination_uris (List[str]):
URIs describing where the extracted data will be written in Cloud
Storage, using the format ``gs://<bucket_name>/<object_name_or_glob>``.
client (google.cloud.bigquery.client.Client):
A client which holds credentials and project configuration.
job_config (Optional[google.cloud.bigquery.job.ExtractJobConfig]):
Extra configuration options for the extract job.
"""
_JOB_TYPE = "extract"
_CONFIG_CLASS = ExtractJobConfig
def __init__(self, job_id, source, destination_uris, client, job_config=None):
super(ExtractJob, self).__init__(job_id, client)
if job_config is not None:
self._properties["configuration"] = job_config._properties
if source:
source_ref = {"projectId": source.project, "datasetId": source.dataset_id}
if isinstance(source, (Table, TableListItem, TableReference)):
source_ref["tableId"] = source.table_id
source_key = "sourceTable"
else:
source_ref["modelId"] = source.model_id
source_key = "sourceModel"
_helpers._set_sub_prop(
self._properties, ["configuration", "extract", source_key], source_ref
)
if destination_uris:
_helpers._set_sub_prop(
self._properties,
["configuration", "extract", "destinationUris"],
destination_uris,
)
@property
def configuration(self) -> ExtractJobConfig:
"""The configuration for this extract job."""
return typing.cast(ExtractJobConfig, super().configuration)
@property
def source(self):
"""Union[ \
google.cloud.bigquery.table.TableReference, \
google.cloud.bigquery.model.ModelReference \
]: Table or Model from which data is to be loaded or extracted.
"""
source_config = _helpers._get_sub_prop(
self._properties, ["configuration", "extract", "sourceTable"]
)
if source_config:
return TableReference.from_api_repr(source_config)
else:
source_config = _helpers._get_sub_prop(
self._properties, ["configuration", "extract", "sourceModel"]
)
return ModelReference.from_api_repr(source_config)
@property
def destination_uris(self):
"""List[str]: URIs describing where the extracted data will be
written in Cloud Storage, using the format
``gs://<bucket_name>/<object_name_or_glob>``.
"""
return _helpers._get_sub_prop(
self._properties, ["configuration", "extract", "destinationUris"]
)
@property
def compression(self):
"""See
:attr:`google.cloud.bigquery.job.ExtractJobConfig.compression`.
"""
return self.configuration.compression
@property
def destination_format(self):
"""See
:attr:`google.cloud.bigquery.job.ExtractJobConfig.destination_format`.
"""
return self.configuration.destination_format
@property
def field_delimiter(self):
"""See
:attr:`google.cloud.bigquery.job.ExtractJobConfig.field_delimiter`.
"""
return self.configuration.field_delimiter
@property
def print_header(self):
"""See
:attr:`google.cloud.bigquery.job.ExtractJobConfig.print_header`.
"""
return self.configuration.print_header
@property
def destination_uri_file_counts(self):
"""Return file counts from job statistics, if present.
See:
https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobStatistics4.FIELDS.destination_uri_file_counts
Returns:
List[int]:
A list of integer counts, each representing the number of files
per destination URI or URI pattern specified in the extract
configuration. These values will be in the same order as the URIs
specified in the 'destinationUris' field. Returns None if job is
not yet complete.
"""
counts = self._job_statistics().get("destinationUriFileCounts")
if counts is not None:
return [int(count) for count in counts]
return None
def to_api_repr(self):
"""Generate a resource for :meth:`_begin`."""
# Exclude statistics, if set.
return {
"jobReference": self._properties["jobReference"],
"configuration": self._properties["configuration"],
}
@classmethod
def from_api_repr(cls, resource: dict, client) -> "ExtractJob":
"""Factory: construct a job given its API representation
.. note::
This method assumes that the project found in the resource matches
the client's project.
Args:
resource (Dict): dataset job representation returned from the API
client (google.cloud.bigquery.client.Client):
Client which holds credentials and project
configuration for the dataset.
Returns:
google.cloud.bigquery.job.ExtractJob: Job parsed from ``resource``.
"""
cls._check_resource_config(resource)
job_ref = _JobReference._from_api_repr(resource["jobReference"])
job = cls(job_ref, None, None, client=client)
job._set_properties(resource)
return job
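
A brief usage sketch of the extract-job classes above, assuming default credentials; the project, table, and Cloud Storage bucket names are hypothetical placeholders:

from google.cloud import bigquery
from google.cloud.bigquery.job import Compression, DestinationFormat, ExtractJobConfig

client = bigquery.Client()  # assumes default credentials and project

config = ExtractJobConfig(
    destination_format=DestinationFormat.NEWLINE_DELIMITED_JSON,
    compression=Compression.GZIP,
)
extract_job = client.extract_table(
    "my-project.my_dataset.my_table",             # hypothetical source table
    "gs://my-bucket/exports/my_table-*.json.gz",  # hypothetical destination URI pattern
    job_config=config,
)
extract_job.result()
print(extract_job.destination_uri_file_counts)  # populated once the job has completed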

View File

@@ -0,0 +1,985 @@
# Copyright 2015 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Classes for load jobs."""
import typing
from typing import FrozenSet, List, Iterable, Optional
from google.cloud.bigquery.encryption_configuration import EncryptionConfiguration
from google.cloud.bigquery.external_config import HivePartitioningOptions
from google.cloud.bigquery.format_options import ParquetOptions
from google.cloud.bigquery import _helpers
from google.cloud.bigquery.schema import SchemaField
from google.cloud.bigquery.schema import _to_schema_fields
from google.cloud.bigquery.table import RangePartitioning
from google.cloud.bigquery.table import TableReference
from google.cloud.bigquery.table import TimePartitioning
from google.cloud.bigquery.job.base import _AsyncJob
from google.cloud.bigquery.job.base import _JobConfig
from google.cloud.bigquery.job.base import _JobReference
from google.cloud.bigquery.query import ConnectionProperty
class ColumnNameCharacterMap:
"""Indicates the character map used for column names.
https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#columnnamecharactermap
"""
COLUMN_NAME_CHARACTER_MAP_UNSPECIFIED = "COLUMN_NAME_CHARACTER_MAP_UNSPECIFIED"
"""Unspecified column name character map."""
STRICT = "STRICT"
"""Support flexible column name and reject invalid column names."""
V1 = "V1"
""" Support alphanumeric + underscore characters and names must start with
a letter or underscore. Invalid column names will be normalized."""
V2 = "V2"
"""Support flexible column name. Invalid column names will be normalized."""
class LoadJobConfig(_JobConfig):
"""Configuration options for load jobs.
Set properties on the constructed configuration by using the property name
as the name of a keyword argument. Values which are unset or :data:`None`
use the BigQuery REST API default values. See the `BigQuery REST API
reference documentation
<https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad>`_
for a list of default values.
Required options differ based on the
:attr:`~google.cloud.bigquery.job.LoadJobConfig.source_format` value.
For example, the BigQuery API's default value for
:attr:`~google.cloud.bigquery.job.LoadJobConfig.source_format` is ``"CSV"``.
When loading a CSV file, either
:attr:`~google.cloud.bigquery.job.LoadJobConfig.schema` must be set or
:attr:`~google.cloud.bigquery.job.LoadJobConfig.autodetect` must be set to
:data:`True`.
"""
def __init__(self, **kwargs) -> None:
super(LoadJobConfig, self).__init__("load", **kwargs)
@property
def allow_jagged_rows(self):
"""Optional[bool]: Allow missing trailing optional columns (CSV only).
See:
https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.allow_jagged_rows
"""
return self._get_sub_prop("allowJaggedRows")
@allow_jagged_rows.setter
def allow_jagged_rows(self, value):
self._set_sub_prop("allowJaggedRows", value)
@property
def allow_quoted_newlines(self):
"""Optional[bool]: Allow quoted data containing newline characters (CSV only).
See:
https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.allow_quoted_newlines
"""
return self._get_sub_prop("allowQuotedNewlines")
@allow_quoted_newlines.setter
def allow_quoted_newlines(self, value):
self._set_sub_prop("allowQuotedNewlines", value)
@property
def autodetect(self):
"""Optional[bool]: Automatically infer the schema from a sample of the data.
See:
https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.autodetect
"""
return self._get_sub_prop("autodetect")
@autodetect.setter
def autodetect(self, value):
self._set_sub_prop("autodetect", value)
@property
def clustering_fields(self):
"""Optional[List[str]]: Fields defining clustering for the table
(Defaults to :data:`None`).
Clustering fields are immutable after table creation.
.. note::
BigQuery supports clustering for both partitioned and
non-partitioned tables.
"""
prop = self._get_sub_prop("clustering")
if prop is not None:
return list(prop.get("fields", ()))
@clustering_fields.setter
def clustering_fields(self, value):
"""Optional[List[str]]: Fields defining clustering for the table
(Defaults to :data:`None`).
"""
if value is not None:
self._set_sub_prop("clustering", {"fields": value})
else:
self._del_sub_prop("clustering")
@property
def connection_properties(self) -> List[ConnectionProperty]:
"""Connection properties.
See
https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.connection_properties
.. versionadded:: 3.7.0
"""
resource = self._get_sub_prop("connectionProperties", [])
return [ConnectionProperty.from_api_repr(prop) for prop in resource]
@connection_properties.setter
def connection_properties(self, value: Iterable[ConnectionProperty]):
self._set_sub_prop(
"connectionProperties",
[prop.to_api_repr() for prop in value],
)
@property
def create_disposition(self):
"""Optional[google.cloud.bigquery.job.CreateDisposition]: Specifies behavior
for creating tables.
See:
https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.create_disposition
"""
return self._get_sub_prop("createDisposition")
@create_disposition.setter
def create_disposition(self, value):
self._set_sub_prop("createDisposition", value)
@property
def create_session(self) -> Optional[bool]:
"""[Preview] If :data:`True`, creates a new session, where
:attr:`~google.cloud.bigquery.job.LoadJob.session_info` will contain a
random server generated session id.
If :data:`False`, runs load job with an existing ``session_id`` passed in
:attr:`~google.cloud.bigquery.job.LoadJobConfig.connection_properties`,
otherwise runs load job in non-session mode.
See
https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.create_session
.. versionadded:: 3.7.0
"""
return self._get_sub_prop("createSession")
@create_session.setter
def create_session(self, value: Optional[bool]):
self._set_sub_prop("createSession", value)
@property
def decimal_target_types(self) -> Optional[FrozenSet[str]]:
"""Possible SQL data types to which the source decimal values are converted.
See:
https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.decimal_target_types
.. versionadded:: 2.21.0
"""
prop = self._get_sub_prop("decimalTargetTypes")
if prop is not None:
prop = frozenset(prop)
return prop
@decimal_target_types.setter
def decimal_target_types(self, value: Optional[Iterable[str]]):
if value is not None:
self._set_sub_prop("decimalTargetTypes", list(value))
else:
self._del_sub_prop("decimalTargetTypes")
@property
def destination_encryption_configuration(self):
"""Optional[google.cloud.bigquery.encryption_configuration.EncryptionConfiguration]: Custom
encryption configuration for the destination table.
Custom encryption configuration (e.g., Cloud KMS keys) or :data:`None`
if using default encryption.
See:
https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.destination_encryption_configuration
"""
prop = self._get_sub_prop("destinationEncryptionConfiguration")
if prop is not None:
prop = EncryptionConfiguration.from_api_repr(prop)
return prop
@destination_encryption_configuration.setter
def destination_encryption_configuration(self, value):
api_repr = value
if value is not None:
api_repr = value.to_api_repr()
self._set_sub_prop("destinationEncryptionConfiguration", api_repr)
else:
self._del_sub_prop("destinationEncryptionConfiguration")
@property
def destination_table_description(self):
"""Optional[str]: Description of the destination table.
See:
https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#DestinationTableProperties.FIELDS.description
"""
prop = self._get_sub_prop("destinationTableProperties")
if prop is not None:
return prop["description"]
@destination_table_description.setter
def destination_table_description(self, value):
keys = [self._job_type, "destinationTableProperties", "description"]
if value is not None:
_helpers._set_sub_prop(self._properties, keys, value)
else:
_helpers._del_sub_prop(self._properties, keys)
@property
def destination_table_friendly_name(self):
"""Optional[str]: Name given to destination table.
See:
https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#DestinationTableProperties.FIELDS.friendly_name
"""
prop = self._get_sub_prop("destinationTableProperties")
if prop is not None:
return prop["friendlyName"]
@destination_table_friendly_name.setter
def destination_table_friendly_name(self, value):
keys = [self._job_type, "destinationTableProperties", "friendlyName"]
if value is not None:
_helpers._set_sub_prop(self._properties, keys, value)
else:
_helpers._del_sub_prop(self._properties, keys)
@property
def encoding(self):
"""Optional[google.cloud.bigquery.job.Encoding]: The character encoding of the
data.
See:
https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.encoding
"""
return self._get_sub_prop("encoding")
@encoding.setter
def encoding(self, value):
self._set_sub_prop("encoding", value)
@property
def field_delimiter(self):
"""Optional[str]: The separator for fields in a CSV file.
See:
https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.field_delimiter
"""
return self._get_sub_prop("fieldDelimiter")
@field_delimiter.setter
def field_delimiter(self, value):
self._set_sub_prop("fieldDelimiter", value)
@property
def hive_partitioning(self):
"""Optional[:class:`~.external_config.HivePartitioningOptions`]: [Beta] When set, \
it configures hive partitioning support.
.. note::
**Experimental**. This feature is experimental and might change or
have limited support.
See:
https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.hive_partitioning_options
"""
prop = self._get_sub_prop("hivePartitioningOptions")
if prop is None:
return None
return HivePartitioningOptions.from_api_repr(prop)
@hive_partitioning.setter
def hive_partitioning(self, value):
if value is not None:
if isinstance(value, HivePartitioningOptions):
value = value.to_api_repr()
else:
raise TypeError("Expected a HivePartitioningOptions instance or None.")
self._set_sub_prop("hivePartitioningOptions", value)
@property
def ignore_unknown_values(self):
"""Optional[bool]: Ignore extra values not represented in the table schema.
See:
https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.ignore_unknown_values
"""
return self._get_sub_prop("ignoreUnknownValues")
@ignore_unknown_values.setter
def ignore_unknown_values(self, value):
self._set_sub_prop("ignoreUnknownValues", value)
@property
def json_extension(self):
"""Optional[str]: The extension to use for writing JSON data to BigQuery. Only supports GeoJSON currently.
See: https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.json_extension
"""
return self._get_sub_prop("jsonExtension")
@json_extension.setter
def json_extension(self, value):
self._set_sub_prop("jsonExtension", value)
@property
def max_bad_records(self):
"""Optional[int]: Number of invalid rows to ignore.
See:
https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.max_bad_records
"""
return _helpers._int_or_none(self._get_sub_prop("maxBadRecords"))
@max_bad_records.setter
def max_bad_records(self, value):
self._set_sub_prop("maxBadRecords", value)
@property
def null_marker(self):
"""Optional[str]: Represents a null value (CSV only).
See:
https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.null_marker
"""
return self._get_sub_prop("nullMarker")
@null_marker.setter
def null_marker(self, value):
self._set_sub_prop("nullMarker", value)
@property
def preserve_ascii_control_characters(self):
"""Optional[bool]: Preserves the embedded ASCII control characters when sourceFormat is set to CSV.
See:
https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.preserve_ascii_control_characters
"""
return self._get_sub_prop("preserveAsciiControlCharacters")
@preserve_ascii_control_characters.setter
def preserve_ascii_control_characters(self, value):
self._set_sub_prop("preserveAsciiControlCharacters", bool(value))
@property
def projection_fields(self) -> Optional[List[str]]:
"""Optional[List[str]]: If
:attr:`google.cloud.bigquery.job.LoadJobConfig.source_format` is set to
"DATASTORE_BACKUP", indicates which entity properties to load into
BigQuery from a Cloud Datastore backup.
Property names are case sensitive and must be top-level properties. If
no properties are specified, BigQuery loads all properties. If any
named property isn't found in the Cloud Datastore backup, an invalid
error is returned in the job result.
See:
https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.projection_fields
"""
return self._get_sub_prop("projectionFields")
@projection_fields.setter
def projection_fields(self, value: Optional[List[str]]):
self._set_sub_prop("projectionFields", value)
@property
def quote_character(self):
"""Optional[str]: Character used to quote data sections (CSV only).
See:
https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.quote
"""
return self._get_sub_prop("quote")
@quote_character.setter
def quote_character(self, value):
self._set_sub_prop("quote", value)
@property
def range_partitioning(self):
"""Optional[google.cloud.bigquery.table.RangePartitioning]:
Configures range-based partitioning for destination table.
.. note::
**Beta**. The integer range partitioning feature is in a
pre-release state and might change or have limited support.
Only specify at most one of
:attr:`~google.cloud.bigquery.job.LoadJobConfig.time_partitioning` or
:attr:`~google.cloud.bigquery.job.LoadJobConfig.range_partitioning`.
Raises:
ValueError:
If the value is not
:class:`~google.cloud.bigquery.table.RangePartitioning` or
:data:`None`.
"""
resource = self._get_sub_prop("rangePartitioning")
if resource is not None:
return RangePartitioning(_properties=resource)
@range_partitioning.setter
def range_partitioning(self, value):
resource = value
if isinstance(value, RangePartitioning):
resource = value._properties
elif value is not None:
raise ValueError(
"Expected value to be RangePartitioning or None, got {}.".format(value)
)
self._set_sub_prop("rangePartitioning", resource)
@property
def reference_file_schema_uri(self):
"""Optional[str]:
When creating an external table, the user can provide a reference file with the
table schema. This is enabled for the following formats:
AVRO, PARQUET, ORC
"""
return self._get_sub_prop("referenceFileSchemaUri")
@reference_file_schema_uri.setter
def reference_file_schema_uri(self, value):
return self._set_sub_prop("referenceFileSchemaUri", value)
@property
def schema(self):
"""Optional[Sequence[Union[ \
:class:`~google.cloud.bigquery.schema.SchemaField`, \
Mapping[str, Any] \
]]]: Schema of the destination table.
See:
https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.schema
"""
schema = _helpers._get_sub_prop(self._properties, ["load", "schema", "fields"])
if schema is None:
return
return [SchemaField.from_api_repr(field) for field in schema]
@schema.setter
def schema(self, value):
if value is None:
self._del_sub_prop("schema")
return
value = _to_schema_fields(value)
_helpers._set_sub_prop(
self._properties,
["load", "schema", "fields"],
[field.to_api_repr() for field in value],
)
@property
def schema_update_options(self):
"""Optional[List[google.cloud.bigquery.job.SchemaUpdateOption]]: Specifies
updates to the destination table schema to allow as a side effect of
the load job.
"""
return self._get_sub_prop("schemaUpdateOptions")
@schema_update_options.setter
def schema_update_options(self, values):
self._set_sub_prop("schemaUpdateOptions", values)
@property
def skip_leading_rows(self):
"""Optional[int]: Number of rows to skip when reading data (CSV only).
See:
https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.skip_leading_rows
"""
return _helpers._int_or_none(self._get_sub_prop("skipLeadingRows"))
@skip_leading_rows.setter
def skip_leading_rows(self, value):
self._set_sub_prop("skipLeadingRows", str(value))
@property
def source_format(self):
"""Optional[google.cloud.bigquery.job.SourceFormat]: File format of the data.
See:
https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.source_format
"""
return self._get_sub_prop("sourceFormat")
@source_format.setter
def source_format(self, value):
self._set_sub_prop("sourceFormat", value)
@property
def time_partitioning(self):
"""Optional[google.cloud.bigquery.table.TimePartitioning]: Specifies time-based
partitioning for the destination table.
Only specify at most one of
:attr:`~google.cloud.bigquery.job.LoadJobConfig.time_partitioning` or
:attr:`~google.cloud.bigquery.job.LoadJobConfig.range_partitioning`.
"""
prop = self._get_sub_prop("timePartitioning")
if prop is not None:
prop = TimePartitioning.from_api_repr(prop)
return prop
@time_partitioning.setter
def time_partitioning(self, value):
api_repr = value
if value is not None:
api_repr = value.to_api_repr()
self._set_sub_prop("timePartitioning", api_repr)
else:
self._del_sub_prop("timePartitioning")
@property
def use_avro_logical_types(self):
"""Optional[bool]: For loads of Avro data, governs whether Avro logical types are
converted to their corresponding BigQuery types (e.g. TIMESTAMP) rather than
raw types (e.g. INTEGER).
"""
return self._get_sub_prop("useAvroLogicalTypes")
@use_avro_logical_types.setter
def use_avro_logical_types(self, value):
self._set_sub_prop("useAvroLogicalTypes", bool(value))
@property
def write_disposition(self):
"""Optional[google.cloud.bigquery.job.WriteDisposition]: Action that occurs if
the destination table already exists.
See:
https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.write_disposition
"""
return self._get_sub_prop("writeDisposition")
@write_disposition.setter
def write_disposition(self, value):
self._set_sub_prop("writeDisposition", value)
@property
def parquet_options(self):
"""Optional[google.cloud.bigquery.format_options.ParquetOptions]: Additional
properties to set if ``sourceFormat`` is set to PARQUET.
See:
https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.parquet_options
"""
prop = self._get_sub_prop("parquetOptions")
if prop is not None:
prop = ParquetOptions.from_api_repr(prop)
return prop
@parquet_options.setter
def parquet_options(self, value):
if value is not None:
self._set_sub_prop("parquetOptions", value.to_api_repr())
else:
self._del_sub_prop("parquetOptions")
@property
def column_name_character_map(self) -> str:
"""Optional[google.cloud.bigquery.job.ColumnNameCharacterMap]:
Character map supported for column names in CSV/Parquet loads. Defaults
to STRICT and can be overridden by Project Config Service. Using this
option with unsupported load formats will result in an error.
See
https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.column_name_character_map
"""
return self._get_sub_prop(
"columnNameCharacterMap",
ColumnNameCharacterMap.COLUMN_NAME_CHARACTER_MAP_UNSPECIFIED,
)
@column_name_character_map.setter
def column_name_character_map(self, value: Optional[str]):
if value is None:
value = ColumnNameCharacterMap.COLUMN_NAME_CHARACTER_MAP_UNSPECIFIED
self._set_sub_prop("columnNameCharacterMap", value)
class LoadJob(_AsyncJob):
"""Asynchronous job for loading data into a table.
Can load from Google Cloud Storage URIs or from a file.
Args:
job_id (str): the job's ID
source_uris (Optional[Sequence[str]]):
URIs of one or more data files to be loaded. See
https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.source_uris
for supported URI formats. Pass None for jobs that load from a file.
destination (google.cloud.bigquery.table.TableReference): reference to table into which data is to be loaded.
client (google.cloud.bigquery.client.Client):
A client which holds credentials and project configuration
for the dataset (which requires a project).
"""
_JOB_TYPE = "load"
_CONFIG_CLASS = LoadJobConfig
def __init__(self, job_id, source_uris, destination, client, job_config=None):
super(LoadJob, self).__init__(job_id, client)
if job_config is not None:
self._properties["configuration"] = job_config._properties
if source_uris is not None:
_helpers._set_sub_prop(
self._properties, ["configuration", "load", "sourceUris"], source_uris
)
if destination is not None:
_helpers._set_sub_prop(
self._properties,
["configuration", "load", "destinationTable"],
destination.to_api_repr(),
)
@property
def configuration(self) -> LoadJobConfig:
"""The configuration for this load job."""
return typing.cast(LoadJobConfig, super().configuration)
@property
def destination(self):
"""google.cloud.bigquery.table.TableReference: table where loaded rows are written
See:
https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.destination_table
"""
dest_config = _helpers._get_sub_prop(
self._properties, ["configuration", "load", "destinationTable"]
)
return TableReference.from_api_repr(dest_config)
@property
def source_uris(self):
"""Optional[Sequence[str]]: URIs of data files to be loaded. See
https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.source_uris
for supported URI formats. None for jobs that load from a file.
"""
return _helpers._get_sub_prop(
self._properties, ["configuration", "load", "sourceUris"]
)
@property
def allow_jagged_rows(self):
"""See
:attr:`google.cloud.bigquery.job.LoadJobConfig.allow_jagged_rows`.
"""
return self.configuration.allow_jagged_rows
@property
def allow_quoted_newlines(self):
"""See
:attr:`google.cloud.bigquery.job.LoadJobConfig.allow_quoted_newlines`.
"""
return self.configuration.allow_quoted_newlines
@property
def autodetect(self):
"""See
:attr:`google.cloud.bigquery.job.LoadJobConfig.autodetect`.
"""
return self.configuration.autodetect
@property
def connection_properties(self) -> List[ConnectionProperty]:
"""See
:attr:`google.cloud.bigquery.job.LoadJobConfig.connection_properties`.
.. versionadded:: 3.7.0
"""
return self.configuration.connection_properties
@property
def create_disposition(self):
"""See
:attr:`google.cloud.bigquery.job.LoadJobConfig.create_disposition`.
"""
return self.configuration.create_disposition
@property
def create_session(self) -> Optional[bool]:
"""See
:attr:`google.cloud.bigquery.job.LoadJobConfig.create_session`.
.. versionadded:: 3.7.0
"""
return self.configuration.create_session
@property
def encoding(self):
"""See
:attr:`google.cloud.bigquery.job.LoadJobConfig.encoding`.
"""
return self.configuration.encoding
@property
def field_delimiter(self):
"""See
:attr:`google.cloud.bigquery.job.LoadJobConfig.field_delimiter`.
"""
return self.configuration.field_delimiter
@property
def ignore_unknown_values(self):
"""See
:attr:`google.cloud.bigquery.job.LoadJobConfig.ignore_unknown_values`.
"""
return self.configuration.ignore_unknown_values
@property
def max_bad_records(self):
"""See
:attr:`google.cloud.bigquery.job.LoadJobConfig.max_bad_records`.
"""
return self.configuration.max_bad_records
@property
def null_marker(self):
"""See
:attr:`google.cloud.bigquery.job.LoadJobConfig.null_marker`.
"""
return self.configuration.null_marker
@property
def quote_character(self):
"""See
:attr:`google.cloud.bigquery.job.LoadJobConfig.quote_character`.
"""
return self.configuration.quote_character
@property
def reference_file_schema_uri(self):
"""See:
:attr:`google.cloud.bigquery.job.LoadJobConfig.reference_file_schema_uri`.
"""
return self.configuration.reference_file_schema_uri
@property
def skip_leading_rows(self):
"""See
:attr:`google.cloud.bigquery.job.LoadJobConfig.skip_leading_rows`.
"""
return self.configuration.skip_leading_rows
@property
def source_format(self):
"""See
:attr:`google.cloud.bigquery.job.LoadJobConfig.source_format`.
"""
return self.configuration.source_format
@property
def write_disposition(self):
"""See
:attr:`google.cloud.bigquery.job.LoadJobConfig.write_disposition`.
"""
return self.configuration.write_disposition
@property
def schema(self):
"""See
:attr:`google.cloud.bigquery.job.LoadJobConfig.schema`.
"""
return self.configuration.schema
@property
def destination_encryption_configuration(self):
"""google.cloud.bigquery.encryption_configuration.EncryptionConfiguration: Custom
encryption configuration for the destination table.
Custom encryption configuration (e.g., Cloud KMS keys)
or :data:`None` if using default encryption.
See
:attr:`google.cloud.bigquery.job.LoadJobConfig.destination_encryption_configuration`.
"""
return self.configuration.destination_encryption_configuration
@property
def destination_table_description(self):
"""Optional[str] name given to destination table.
See:
https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#DestinationTableProperties.FIELDS.description
"""
return self.configuration.destination_table_description
@property
def destination_table_friendly_name(self):
"""Optional[str] name given to destination table.
See:
https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#DestinationTableProperties.FIELDS.friendly_name
"""
return self.configuration.destination_table_friendly_name
@property
def range_partitioning(self):
"""See
:attr:`google.cloud.bigquery.job.LoadJobConfig.range_partitioning`.
"""
return self.configuration.range_partitioning
@property
def time_partitioning(self):
"""See
:attr:`google.cloud.bigquery.job.LoadJobConfig.time_partitioning`.
"""
return self.configuration.time_partitioning
@property
def use_avro_logical_types(self):
"""See
:attr:`google.cloud.bigquery.job.LoadJobConfig.use_avro_logical_types`.
"""
return self.configuration.use_avro_logical_types
@property
def clustering_fields(self):
"""See
:attr:`google.cloud.bigquery.job.LoadJobConfig.clustering_fields`.
"""
return self.configuration.clustering_fields
@property
def schema_update_options(self):
"""See
:attr:`google.cloud.bigquery.job.LoadJobConfig.schema_update_options`.
"""
return self.configuration.schema_update_options
@property
def input_file_bytes(self):
"""Count of bytes loaded from source files.
Returns:
Optional[int]: the count (None until set from the server).
Raises:
ValueError: for invalid value types.
"""
return _helpers._int_or_none(
_helpers._get_sub_prop(
self._properties, ["statistics", "load", "inputFileBytes"]
)
)
@property
def input_files(self):
"""Count of source files.
Returns:
Optional[int]: the count (None until set from the server).
"""
return _helpers._int_or_none(
_helpers._get_sub_prop(
self._properties, ["statistics", "load", "inputFiles"]
)
)
@property
def output_bytes(self):
"""Count of bytes saved to destination table.
Returns:
Optional[int]: the count (None until set from the server).
"""
return _helpers._int_or_none(
_helpers._get_sub_prop(
self._properties, ["statistics", "load", "outputBytes"]
)
)
@property
def output_rows(self):
"""Count of rows saved to destination table.
Returns:
Optional[int]: the count (None until set from the server).
"""
return _helpers._int_or_none(
_helpers._get_sub_prop(
self._properties, ["statistics", "load", "outputRows"]
)
)
def to_api_repr(self):
"""Generate a resource for :meth:`_begin`."""
# Exclude statistics, if set.
return {
"jobReference": self._properties["jobReference"],
"configuration": self._properties["configuration"],
}
@classmethod
def from_api_repr(cls, resource: dict, client) -> "LoadJob":
"""Factory: construct a job given its API representation
.. note::
This method assumes that the project found in the resource matches
the client's project.
Args:
resource (Dict): dataset job representation returned from the API
client (google.cloud.bigquery.client.Client):
Client which holds credentials and project
configuration for the dataset.
Returns:
google.cloud.bigquery.job.LoadJob: Job parsed from ``resource``.
"""
cls._check_resource_config(resource)
job_ref = _JobReference._from_api_repr(resource["jobReference"])
job = cls(job_ref, None, None, client)
job._set_properties(resource)
return job
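
A minimal usage sketch of the load-job classes above, assuming default credentials; the Cloud Storage URI pattern and table ID are hypothetical placeholders:

from google.cloud import bigquery
from google.cloud.bigquery.job import LoadJobConfig, SourceFormat, WriteDisposition

client = bigquery.Client()  # assumes default credentials and project

config = LoadJobConfig(
    source_format=SourceFormat.CSV,
    skip_leading_rows=1,
    autodetect=True,  # or supply an explicit schema=[SchemaField(...), ...]
    write_disposition=WriteDisposition.WRITE_APPEND,
)
load_job = client.load_table_from_uri(
    "gs://my-bucket/data/events-*.csv",  # hypothetical source URI pattern
    "my-project.my_dataset.events",      # hypothetical destination table
    job_config=config,
)
load_job.result()
print(load_job.output_rows, "rows loaded from", load_job.input_files, "files")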

File diff suppressed because it is too large

View File

@@ -0,0 +1,20 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from google.cloud.bigquery.magics.magics import context
# For backwards compatibility we need to make the context available in the path
# google.cloud.bigquery.magics.context
__all__ = ("context",)

View File

@@ -0,0 +1,34 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from google.cloud.bigquery.magics.line_arg_parser.exceptions import ParseError
from google.cloud.bigquery.magics.line_arg_parser.exceptions import (
DuplicateQueryParamsError,
QueryParamsParseError,
)
from google.cloud.bigquery.magics.line_arg_parser.lexer import Lexer
from google.cloud.bigquery.magics.line_arg_parser.lexer import TokenType
from google.cloud.bigquery.magics.line_arg_parser.parser import Parser
from google.cloud.bigquery.magics.line_arg_parser.visitors import QueryParamsExtractor
__all__ = (
"DuplicateQueryParamsError",
"Lexer",
"Parser",
"ParseError",
"QueryParamsExtractor",
"QueryParamsParseError",
"TokenType",
)

View File

@@ -0,0 +1,25 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
class ParseError(Exception):
pass
class QueryParamsParseError(ParseError):
"""Raised when --params option is syntactically incorrect."""
class DuplicateQueryParamsError(ParseError):
pass

View File

@@ -0,0 +1,200 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import namedtuple
from collections import OrderedDict
import itertools
import re
import enum
Token = namedtuple("Token", ("type_", "lexeme", "pos"))
StateTransition = namedtuple("StateTransition", ("new_state", "total_offset"))
# Pattern matching is done with regexes, and the order in which the token patterns are
# defined is important.
#
# Suppose we had the following token definitions:
# * INT - a token matching integers,
# * FLOAT - a token matching floating point numbers,
# * DOT - a token matching a single literal dot character, i.e. "."
#
# The FLOAT token would have to be defined first, since we would want the input "1.23"
# to be tokenized as a single FLOAT token, and *not* three tokens (INT, DOT, INT).
#
# Sometimes, however, different tokens match too similar patterns, and it is not
# possible to define them in order that would avoid any ambiguity. One such case are
# the OPT_VAL and PY_NUMBER tokens, as both can match an integer literal, say "42".
#
# In order to avoid the dilemmas, the lexer implements a concept of STATES. States are
# used to split token definitions into subgroups, and in each lexer state only a single
# subgroup is used for tokenizing the input. Lexer states can therefore be thought of as
# token namespaces.
#
# For example, while parsing the value of the "--params" option, we do not want to
# "recognize" it as a single OPT_VAL token, but instead want to parse it as a Python
# dictionary and verify its syntactical correctness. On the other hand, while parsing
# the value of an option other than "--params", we do not really care about its
# structure, and thus do not want to use any of the "Python tokens" for pattern matching.
#
# Token definition order is important, thus an OrderedDict is used. In addition, PEP 468
# guarantees us that the order of kwargs is preserved in Python 3.6+.
token_types = OrderedDict(
state_parse_pos_args=OrderedDict(
GOTO_PARSE_NON_PARAMS_OPTIONS=r"(?P<GOTO_PARSE_NON_PARAMS_OPTIONS>(?=--))", # double dash - starting the options list
DEST_VAR=r"(?P<DEST_VAR>[^\d\W]\w*)", # essentially a Python ID
),
state_parse_non_params_options=OrderedDict(
GOTO_PARSE_PARAMS_OPTION=r"(?P<GOTO_PARSE_PARAMS_OPTION>(?=--params(?:\s|=|--|$)))", # the --params option
OPTION_SPEC=r"(?P<OPTION_SPEC>--\w+)",
OPTION_EQ=r"(?P<OPTION_EQ>=)",
OPT_VAL=r"(?P<OPT_VAL>\S+?(?=\s|--|$))",
),
state_parse_params_option=OrderedDict(
PY_STRING=r"(?P<PY_STRING>(?:{})|(?:{}))".format( # single and double quoted strings
r"'(?:[^'\\]|\.)*'", r'"(?:[^"\\]|\.)*"'
),
PARAMS_OPT_SPEC=r"(?P<PARAMS_OPT_SPEC>--params(?=\s|=|--|$))",
PARAMS_OPT_EQ=r"(?P<PARAMS_OPT_EQ>=)",
GOTO_PARSE_NON_PARAMS_OPTIONS=r"(?P<GOTO_PARSE_NON_PARAMS_OPTIONS>(?=--\w+))", # found another option spec
PY_BOOL=r"(?P<PY_BOOL>True|False)",
DOLLAR_PY_ID=r"(?P<DOLLAR_PY_ID>\$[^\d\W]\w*)",
PY_NUMBER=r"(?P<PY_NUMBER>-?[1-9]\d*(?:\.\d+)?(:?[e|E][+-]?\d+)?)",
SQUOTE=r"(?P<SQUOTE>')",
DQUOTE=r'(?P<DQUOTE>")',
COLON=r"(?P<COLON>:)",
COMMA=r"(?P<COMMA>,)",
LCURL=r"(?P<LCURL>\{)",
RCURL=r"(?P<RCURL>})",
LSQUARE=r"(?P<LSQUARE>\[)",
RSQUARE=r"(?P<RSQUARE>])",
LPAREN=r"(?P<LPAREN>\()",
RPAREN=r"(?P<RPAREN>\))",
),
common=OrderedDict(
WS=r"(?P<WS>\s+)",
EOL=r"(?P<EOL>$)",
UNKNOWN=r"(?P<UNKNOWN>\S+)", # anything not a whitespace or matched by something else
),
)
class AutoStrEnum(str, enum.Enum):
"""Base enum class for for name=value str enums."""
def _generate_next_value_(name, start, count, last_values):
return name
TokenType = AutoStrEnum( # type: ignore # pytype: disable=wrong-arg-types
"TokenType",
[
(name, enum.auto())
for name in itertools.chain.from_iterable(token_types.values())
if not name.startswith("GOTO_")
],
)
class LexerState(AutoStrEnum):
PARSE_POS_ARGS = enum.auto() # parsing positional arguments
PARSE_NON_PARAMS_OPTIONS = enum.auto() # parsing options other than "--params"
PARSE_PARAMS_OPTION = enum.auto() # parsing the "--params" option
STATE_END = enum.auto()
class Lexer(object):
"""Lexical analyzer for tokenizing the cell magic input line."""
_GRAND_PATTERNS = {
LexerState.PARSE_POS_ARGS: re.compile(
"|".join(
itertools.chain(
token_types["state_parse_pos_args"].values(),
token_types["common"].values(),
)
)
),
LexerState.PARSE_NON_PARAMS_OPTIONS: re.compile(
"|".join(
itertools.chain(
token_types["state_parse_non_params_options"].values(),
token_types["common"].values(),
)
)
),
LexerState.PARSE_PARAMS_OPTION: re.compile(
"|".join(
itertools.chain(
token_types["state_parse_params_option"].values(),
token_types["common"].values(),
)
)
),
}
def __init__(self, input_text):
self._text = input_text
def __iter__(self):
# Since re.scanner does not seem to support manipulating inner scanner states,
# we need to implement lexer state transitions manually using special
# non-capturing lookahead token patterns to signal when a state transition
# should be made.
# Since we don't have "nested" states, we don't really need a stack and
# this simple mechanism is sufficient.
state = LexerState.PARSE_POS_ARGS
offset = 0 # the number of characters processed so far
while state != LexerState.STATE_END:
token_stream = self._find_state_tokens(state, offset)
for maybe_token in token_stream: # pragma: NO COVER
if isinstance(maybe_token, StateTransition):
state = maybe_token.new_state
offset = maybe_token.total_offset
break
if maybe_token.type_ != TokenType.WS:
yield maybe_token
if maybe_token.type_ == TokenType.EOL:
state = LexerState.STATE_END
break
def _find_state_tokens(self, state, current_offset):
"""Scan the input for current state's tokens starting at ``current_offset``.
Args:
state (LexerState): The current lexer state.
current_offset (int): The offset in the input text, i.e. the number
of characters already scanned so far.
Yields:
The next ``Token`` or ``StateTransition`` instance.
"""
pattern = self._GRAND_PATTERNS[state]
scanner = pattern.finditer(self._text, current_offset)
for match in scanner: # pragma: NO COVER
token_type = match.lastgroup
if token_type.startswith("GOTO_"):
yield StateTransition(
new_state=getattr(LexerState, token_type[5:]), # w/o "GOTO_" prefix
total_offset=match.start(),
)
yield Token(token_type, match.group(), match.start())
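
To illustrate how the lexer states and token types above fit together with the ``Parser`` exported by this package, here is a small sketch; the argument line and option names are hypothetical:

from google.cloud.bigquery.magics.line_arg_parser import Lexer, Parser, QueryParamsParseError

line = 'result_df --max_results 100 --params {"min_age": 18}'  # hypothetical %%bigquery arguments

# Tokenize only: whitespace tokens are filtered out by Lexer.__iter__().
for token in Lexer(line):
    print(token.type_, repr(token.lexeme), token.pos)

# Parse the same line into an abstract syntax tree (an InputLine node).
try:
    tree = Parser(Lexer(line)).input_line()
except QueryParamsParseError as exc:
    print("Bad --params value:", exc)
else:
    print(tree.destination_var.name)      # 'result_df'
    print(len(tree.option_list.options))  # 2 options: --max_results and --params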

View File

@@ -0,0 +1,484 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from google.cloud.bigquery.magics.line_arg_parser import DuplicateQueryParamsError
from google.cloud.bigquery.magics.line_arg_parser import ParseError
from google.cloud.bigquery.magics.line_arg_parser import QueryParamsParseError
from google.cloud.bigquery.magics.line_arg_parser import TokenType
class ParseNode(object):
"""A base class for nodes in the input parsed to an abstract syntax tree."""
class InputLine(ParseNode):
def __init__(self, destination_var, option_list):
self.destination_var = destination_var
self.option_list = option_list
class DestinationVar(ParseNode):
def __init__(self, token):
# token type is DEST_VAR
self.token = token
self.name = token.lexeme if token is not None else None
class CmdOptionList(ParseNode):
def __init__(self, option_nodes):
self.options = [node for node in option_nodes] # shallow copy
class CmdOption(ParseNode):
def __init__(self, name, value):
self.name = name # string
self.value = value # CmdOptionValue node
class ParamsOption(CmdOption):
def __init__(self, value):
super(ParamsOption, self).__init__("params", value)
class CmdOptionValue(ParseNode):
def __init__(self, token):
# token type is OPT_VAL
self.token = token
self.value = token.lexeme
class PyVarExpansion(ParseNode):
def __init__(self, token):
self.token = token
self.raw_value = token.lexeme
class PyDict(ParseNode):
def __init__(self, dict_items):
self.items = [item for item in dict_items] # shallow copy
class PyDictItem(ParseNode):
def __init__(self, key, value):
self.key = key
self.value = value
class PyDictKey(ParseNode):
def __init__(self, token):
self.token = token
self.key_value = token.lexeme
class PyScalarValue(ParseNode):
def __init__(self, token, raw_value):
self.token = token
self.raw_value = raw_value
class PyTuple(ParseNode):
def __init__(self, tuple_items):
self.items = [item for item in tuple_items] # shallow copy
class PyList(ParseNode):
def __init__(self, list_items):
self.items = [item for item in list_items] # shallow copy
class Parser(object):
"""Parser for the tokenized cell magic input line.
The parser recognizes a simplified subset of Python grammar, specifically
a dictionary representation in typical use cases when the "--params" option
is used with the %%bigquery cell magic.
The grammar (terminal symbols are CAPITALIZED):
input_line : destination_var option_list
destination_var : DEST_VAR | EMPTY
option_list : (OPTION_SPEC [OPTION_EQ] option_value)*
(params_option | EMPTY)
(OPTION_SPEC [OPTION_EQ] option_value)*
option_value : OPT_VAL | EMPTY
# DOLLAR_PY_ID can occur if a variable passed to --params does not exist
# and is thus not expanded to a dict.
params_option : PARAMS_OPT_SPEC [PARAMS_OPT_EQ] \
(DOLLAR_PY_ID | PY_STRING | py_dict)
py_dict : LCURL dict_items RCURL
dict_items : dict_item | (dict_item COMMA dict_items)
dict_item : (dict_key COLON py_value) | EMPTY
# dict items are actually @parameter names in the cell body (i.e. the query),
# thus restricting them to strings.
dict_key : PY_STRING
py_value : PY_BOOL
| PY_NUMBER
| PY_STRING
| py_tuple
| py_list
| py_dict
py_tuple : LPAREN collection_items RPAREN
py_list : LSQUARE collection_items RSQUARE
collection_items : collection_item | (collection_item COMMA collection_items)
collection_item : py_value | EMPTY
Args:
lexer (line_arg_parser.lexer.Lexer):
An iterable producing a tokenized cell magic argument line.
"""
def __init__(self, lexer):
self._lexer = lexer
self._tokens_iter = iter(self._lexer)
self.get_next_token()
def get_next_token(self):
"""Obtain the next token from the token stream and store it as current."""
token = next(self._tokens_iter)
self._current_token = token
def consume(self, expected_type, exc_type=ParseError):
"""Move to the next token in token stream if it matches the expected type.
Args:
expected_type (lexer.TokenType): The expected token type to be consumed.
exc_type (Optional[ParseError]): The type of the exception to raise. Should be
the ``ParseError`` class or one of its subclasses. Defaults to
``ParseError``.
Raises:
ParseError: If the current token does not match the expected type.
"""
if self._current_token.type_ == expected_type:
if expected_type != TokenType.EOL:
self.get_next_token()
else:
if self._current_token.type_ == TokenType.EOL:
msg = "Unexpected end of input, expected {}.".format(expected_type)
else:
msg = "Expected token type {}, but found {} at position {}.".format(
expected_type, self._current_token.lexeme, self._current_token.pos
)
self.error(message=msg, exc_type=exc_type)
def error(self, message="Syntax error.", exc_type=ParseError):
"""Raise an error with the given message.
Args:
message (Optional[str]): The error message to report. Defaults to
``"Syntax error."``.
exc_type (Optional[ParseError]): The type of the exception to raise. Should be
the ``ParseError`` class or one of its subclasses. Defaults to
``ParseError``.
Raises:
ParseError: An exception of the given ``exc_type``, carrying ``message``.
"""
raise exc_type(message)
def input_line(self):
"""The top level method for parsing the cell magic arguments line.
Implements the following grammar production rule:
input_line : destination_var option_list
"""
dest_var = self.destination_var()
options = self.option_list()
token = self._current_token
if token.type_ != TokenType.EOL:
msg = "Unexpected input at position {}: {}".format(token.pos, token.lexeme)
self.error(msg)
return InputLine(dest_var, options)
def destination_var(self):
"""Implementation of the ``destination_var`` grammar production rule.
Production:
destination_var : DEST_VAR | EMPTY
"""
token = self._current_token
if token.type_ == TokenType.DEST_VAR:
self.consume(TokenType.DEST_VAR)
result = DestinationVar(token)
elif token.type_ == TokenType.UNKNOWN:
msg = "Unknown input at position {}: {}".format(token.pos, token.lexeme)
self.error(msg)
else:
result = DestinationVar(None)
return result
def option_list(self):
"""Implementation of the ``option_list`` grammar production rule.
Production:
option_list : (OPTION_SPEC [OPTION_EQ] option_value)*
(params_option | EMPTY)
(OPTION_SPEC [OPTION_EQ] option_value)*
"""
all_options = []
def parse_nonparams_options():
while self._current_token.type_ == TokenType.OPTION_SPEC:
token = self._current_token
self.consume(TokenType.OPTION_SPEC)
opt_name = token.lexeme[2:] # cut off the "--" prefix
# skip the optional "=" character
if self._current_token.type_ == TokenType.OPTION_EQ:
self.consume(TokenType.OPTION_EQ)
opt_value = self.option_value()
option = CmdOption(opt_name, opt_value)
all_options.append(option)
parse_nonparams_options()
token = self._current_token
if token.type_ == TokenType.PARAMS_OPT_SPEC:
option = self.params_option()
all_options.append(option)
parse_nonparams_options()
if self._current_token.type_ == TokenType.PARAMS_OPT_SPEC:
self.error(
message="Duplicate --params option", exc_type=DuplicateQueryParamsError
)
return CmdOptionList(all_options)
def option_value(self):
"""Implementation of the ``option_value`` grammar production rule.
Production:
option_value : OPT_VAL | EMPTY
"""
token = self._current_token
if token.type_ == TokenType.OPT_VAL:
self.consume(TokenType.OPT_VAL)
result = CmdOptionValue(token)
elif token.type_ == TokenType.UNKNOWN:
msg = "Unknown input at position {}: {}".format(token.pos, token.lexeme)
self.error(msg)
else:
result = None
return result
def params_option(self):
"""Implementation of the ``params_option`` grammar production rule.
Production:
params_option : PARAMS_OPT_SPEC [PARAMS_OPT_EQ] \
(DOLLAR_PY_ID | PY_STRING | py_dict)
"""
self.consume(TokenType.PARAMS_OPT_SPEC)
# skip the optional "=" character
if self._current_token.type_ == TokenType.PARAMS_OPT_EQ:
self.consume(TokenType.PARAMS_OPT_EQ)
if self._current_token.type_ == TokenType.DOLLAR_PY_ID:
token = self._current_token
self.consume(TokenType.DOLLAR_PY_ID)
opt_value = PyVarExpansion(token)
elif self._current_token.type_ == TokenType.PY_STRING:
token = self._current_token
self.consume(TokenType.PY_STRING, exc_type=QueryParamsParseError)
opt_value = PyScalarValue(token, token.lexeme)
else:
opt_value = self.py_dict()
result = ParamsOption(opt_value)
return result
def py_dict(self):
"""Implementation of the ``py_dict`` grammar production rule.
Production:
py_dict : LCURL dict_items RCURL
"""
self.consume(TokenType.LCURL, exc_type=QueryParamsParseError)
dict_items = self.dict_items()
self.consume(TokenType.RCURL, exc_type=QueryParamsParseError)
return PyDict(dict_items)
def dict_items(self):
"""Implementation of the ``dict_items`` grammar production rule.
Production:
dict_items : dict_item | (dict_item COMMA dict_items)
"""
result = []
item = self.dict_item()
if item is not None:
result.append(item)
while self._current_token.type_ == TokenType.COMMA:
self.consume(TokenType.COMMA, exc_type=QueryParamsParseError)
item = self.dict_item()
if item is not None:
result.append(item)
return result
def dict_item(self):
"""Implementation of the ``dict_item`` grammar production rule.
Production:
dict_item : (dict_key COLON py_value) | EMPTY
"""
token = self._current_token
if token.type_ == TokenType.PY_STRING:
key = self.dict_key()
self.consume(TokenType.COLON, exc_type=QueryParamsParseError)
value = self.py_value()
result = PyDictItem(key, value)
elif token.type_ == TokenType.UNKNOWN:
msg = "Unknown input at position {}: {}".format(token.pos, token.lexeme)
self.error(msg, exc_type=QueryParamsParseError)
else:
result = None
return result
def dict_key(self):
"""Implementation of the ``dict_key`` grammar production rule.
Production:
dict_key : PY_STRING
"""
token = self._current_token
self.consume(TokenType.PY_STRING, exc_type=QueryParamsParseError)
return PyDictKey(token)
def py_value(self):
"""Implementation of the ``py_value`` grammar production rule.
Production:
py_value : PY_BOOL | PY_NUMBER | PY_STRING | py_tuple | py_list | py_dict
"""
token = self._current_token
if token.type_ == TokenType.PY_BOOL:
self.consume(TokenType.PY_BOOL, exc_type=QueryParamsParseError)
return PyScalarValue(token, token.lexeme)
elif token.type_ == TokenType.PY_NUMBER:
self.consume(TokenType.PY_NUMBER, exc_type=QueryParamsParseError)
return PyScalarValue(token, token.lexeme)
elif token.type_ == TokenType.PY_STRING:
self.consume(TokenType.PY_STRING, exc_type=QueryParamsParseError)
return PyScalarValue(token, token.lexeme)
elif token.type_ == TokenType.LPAREN:
tuple_node = self.py_tuple()
return tuple_node
elif token.type_ == TokenType.LSQUARE:
list_node = self.py_list()
return list_node
elif token.type_ == TokenType.LCURL:
dict_node = self.py_dict()
return dict_node
else:
msg = "Unexpected token type {} at position {}.".format(
token.type_, token.pos
)
self.error(msg, exc_type=QueryParamsParseError)
def py_tuple(self):
"""Implementation of the ``py_tuple`` grammar production rule.
Production:
py_tuple : LPAREN collection_items RPAREN
"""
self.consume(TokenType.LPAREN, exc_type=QueryParamsParseError)
items = self.collection_items()
self.consume(TokenType.RPAREN, exc_type=QueryParamsParseError)
return PyTuple(items)
def py_list(self):
"""Implementation of the ``py_list`` grammar production rule.
Production:
py_list : LSQUARE collection_items RSQUARE
"""
self.consume(TokenType.LSQUARE, exc_type=QueryParamsParseError)
items = self.collection_items()
self.consume(TokenType.RSQUARE, exc_type=QueryParamsParseError)
return PyList(items)
def collection_items(self):
"""Implementation of the ``collection_items`` grammar production rule.
Production:
collection_items : collection_item | (collection_item COMMA collection_items)
"""
result = []
item = self.collection_item()
if item is not None:
result.append(item)
while self._current_token.type_ == TokenType.COMMA:
self.consume(TokenType.COMMA, exc_type=QueryParamsParseError)
item = self.collection_item()
if item is not None:
result.append(item)
return result
def collection_item(self):
"""Implementation of the ``collection_item`` grammar production rule.
Production:
collection_item : py_value | EMPTY
"""
if self._current_token.type_ not in {TokenType.RPAREN, TokenType.RSQUARE}:
result = self.py_value()
else:
result = None # end of list/tuple items
return result
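# Example (illustrative only, not part of the original module): a minimal sketch
# of how this parser is typically driven, mirroring the cell magic code, which
# imports ``Lexer`` and ``Parser`` from this ``line_arg_parser`` package.
def _example_parse_magic_line():
    from google.cloud.bigquery.magics.line_arg_parser import Lexer, Parser

    line = 'df --params {"num": 17} --use_legacy_sql'
    tree = Parser(Lexer(line)).input_line()
    # ``tree`` is an InputLine node: ``tree.destination_var`` holds the "df"
    # token, and ``tree.option_list`` contains the --params and --use_legacy_sql
    # options as ParamsOption / CmdOption nodes.
    return tree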

View File

@@ -0,0 +1,159 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""This module contains classes that traverse AST and convert it to something else.
If the parser successfully accepts a valid input (the bigquery cell magic arguments),
the result is an Abstract Syntax Tree (AST) that represents the input as a tree
with nodes containing various useful metadata.
Node visitors can process such a tree and convert it to something else that can
be used for further processing, for example:
* An optimized version of the tree with redundancy removed/simplified (not used here).
* The same tree, but with semantic errors checked, because an otherwise syntactically
valid input might still contain errors (not used here, semantic errors are detected
elsewhere).
* A form that can be directly handed to the code that operates on the input. The
``QueryParamsExtractor`` class, for instance, splits the input arguments into
the "--params <...>" part and everything else.
The "everything else" part can be then parsed by the default Jupyter argument parser,
while the --params option is processed separately by the Python evaluator.
More info on the visitor design pattern:
https://en.wikipedia.org/wiki/Visitor_pattern
"""
from __future__ import print_function
class NodeVisitor(object):
"""Base visitor class implementing the dispatch machinery."""
def visit(self, node):
method_name = "visit_{}".format(type(node).__name__)
visitor_method = getattr(self, method_name, self.method_missing)
return visitor_method(node)
def method_missing(self, node):
raise Exception("No visit_{} method".format(type(node).__name__))
class QueryParamsExtractor(NodeVisitor):
"""A visitor that extracts the "--params <...>" part from input line arguments."""
def visit_InputLine(self, node):
params_dict_parts = []
other_parts = []
dest_var_parts = self.visit(node.destination_var)
params, other_options = self.visit(node.option_list)
if dest_var_parts:
other_parts.extend(dest_var_parts)
if dest_var_parts and other_options:
other_parts.append(" ")
other_parts.extend(other_options)
params_dict_parts.extend(params)
return "".join(params_dict_parts), "".join(other_parts)
def visit_DestinationVar(self, node):
return [node.name] if node.name is not None else []
def visit_CmdOptionList(self, node):
params_opt_parts = []
other_parts = []
for i, opt in enumerate(node.options):
option_parts = self.visit(opt)
list_to_extend = params_opt_parts if opt.name == "params" else other_parts
if list_to_extend:
list_to_extend.append(" ")
list_to_extend.extend(option_parts)
return params_opt_parts, other_parts
def visit_CmdOption(self, node):
result = ["--{}".format(node.name)]
if node.value is not None:
result.append(" ")
value_parts = self.visit(node.value)
result.extend(value_parts)
return result
def visit_CmdOptionValue(self, node):
return [node.value]
def visit_ParamsOption(self, node):
value_parts = self.visit(node.value)
return value_parts
def visit_PyVarExpansion(self, node):
return [node.raw_value]
def visit_PyDict(self, node):
result = ["{"]
for i, item in enumerate(node.items):
if i > 0:
result.append(", ")
item_parts = self.visit(item)
result.extend(item_parts)
result.append("}")
return result
def visit_PyDictItem(self, node):
result = self.visit(node.key) # key parts
result.append(": ")
value_parts = self.visit(node.value)
result.extend(value_parts)
return result
def visit_PyDictKey(self, node):
return [node.key_value]
def visit_PyScalarValue(self, node):
return [node.raw_value]
def visit_PyTuple(self, node):
result = ["("]
for i, item in enumerate(node.items):
if i > 0:
result.append(", ")
item_parts = self.visit(item)
result.extend(item_parts)
result.append(")")
return result
def visit_PyList(self, node):
result = ["["]
for i, item in enumerate(node.items):
if i > 0:
result.append(", ")
item_parts = self.visit(item)
result.extend(item_parts)
result.append("]")
return result
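# Example (illustrative only, not part of the original module): a minimal sketch
# of how ``QueryParamsExtractor`` splits the raw "--params" value from the rest
# of the arguments, mirroring ``_split_args_line()`` in the magics module. The
# exact whitespace in the second element is an implementation detail.
def _example_extract_params():
    from google.cloud.bigquery.magics.line_arg_parser import (
        Lexer,
        Parser,
        QueryParamsExtractor,
    )

    tree = Parser(Lexer('df --params {"num": 17} --use_legacy_sql')).input_line()
    params, rest = QueryParamsExtractor().visit(tree)
    # params -> '{"num": 17}' (ready for ast.literal_eval)
    # rest   -> 'df --use_legacy_sql' (handed to the IPython argument parser)
    return params, rest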

View File

@@ -0,0 +1,776 @@
# Copyright 2018 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""IPython Magics
Install ``bigquery-magics`` and call ``%load_ext bigquery_magics`` to use the
``%%bigquery`` cell magic.
See the `BigQuery Magics reference documentation
<https://googleapis.dev/python/bigquery-magics/latest/>`_.
"""
from __future__ import print_function
import re
import ast
import copy
import functools
import sys
import time
import warnings
from concurrent import futures
try:
import IPython # type: ignore
from IPython import display # type: ignore
from IPython.core import magic_arguments # type: ignore
except ImportError:
raise ImportError("This module can only be loaded in IPython.")
from google.api_core import client_info
from google.api_core import client_options
from google.api_core.exceptions import NotFound
import google.auth # type: ignore
from google.cloud import bigquery
import google.cloud.bigquery.dataset
from google.cloud.bigquery import _versions_helpers
from google.cloud.bigquery import exceptions
from google.cloud.bigquery.dbapi import _helpers
from google.cloud.bigquery.magics import line_arg_parser as lap
try:
import bigquery_magics # type: ignore
except ImportError:
bigquery_magics = None
IPYTHON_USER_AGENT = "ipython-{}".format(IPython.__version__) # type: ignore
class Context(object):
"""Storage for objects to be used throughout an IPython notebook session.
A Context object is initialized when the ``magics`` module is imported,
and can be found at ``google.cloud.bigquery.magics.context``.
"""
def __init__(self):
self._credentials = None
self._project = None
self._connection = None
self._default_query_job_config = bigquery.QueryJobConfig()
self._bigquery_client_options = client_options.ClientOptions()
self._bqstorage_client_options = client_options.ClientOptions()
self._progress_bar_type = "tqdm_notebook"
@property
def credentials(self):
"""google.auth.credentials.Credentials: Credentials to use for queries
performed through IPython magics.
Note:
These credentials do not need to be explicitly defined if you are
using Application Default Credentials. If you are not using
Application Default Credentials, manually construct a
:class:`google.auth.credentials.Credentials` object and set it as
the context credentials as demonstrated in the example below. See
`auth docs`_ for more information on obtaining credentials.
Example:
Manually setting the context credentials:
>>> from google.cloud.bigquery import magics
>>> from google.oauth2 import service_account
>>> credentials = (service_account
... .Credentials.from_service_account_file(
... '/path/to/key.json'))
>>> magics.context.credentials = credentials
.. _auth docs: http://google-auth.readthedocs.io
/en/latest/user-guide.html#obtaining-credentials
"""
if self._credentials is None:
self._credentials, _ = google.auth.default()
return self._credentials
@credentials.setter
def credentials(self, value):
self._credentials = value
@property
def project(self):
"""str: Default project to use for queries performed through IPython
magics.
Note:
The project does not need to be explicitly defined if you have an
environment default project set. If you do not have a default
project set in your environment, manually assign the project as
demonstrated in the example below.
Example:
Manually setting the context project:
>>> from google.cloud.bigquery import magics
>>> magics.context.project = 'my-project'
"""
if self._project is None:
_, self._project = google.auth.default()
return self._project
@project.setter
def project(self, value):
self._project = value
@property
def bigquery_client_options(self):
"""google.api_core.client_options.ClientOptions: client options to be
used through IPython magics.
Note:
The client options do not need to be explicitly defined if no
special network connections are required. Normally you would be
using the https://bigquery.googleapis.com/ endpoint.
Example:
Manually setting the endpoint:
>>> from google.cloud.bigquery import magics
>>> client_options = {}
>>> client_options['api_endpoint'] = "https://some.special.url"
>>> magics.context.bigquery_client_options = client_options
"""
return self._bigquery_client_options
@bigquery_client_options.setter
def bigquery_client_options(self, value):
self._bigquery_client_options = value
@property
def bqstorage_client_options(self):
"""google.api_core.client_options.ClientOptions: client options to be
used through IPython magics for the storage client.
Note:
The client options do not need to be explicitly defined if no
special network connections are required. Normally you would be
using the https://bigquerystorage.googleapis.com/ endpoint.
Example:
Manually setting the endpoint:
>>> from google.cloud.bigquery import magics
>>> client_options = {}
>>> client_options['api_endpoint'] = "https://some.special.url"
>>> magics.context.bqstorage_client_options = client_options
"""
return self._bqstorage_client_options
@bqstorage_client_options.setter
def bqstorage_client_options(self, value):
self._bqstorage_client_options = value
@property
def default_query_job_config(self):
"""google.cloud.bigquery.job.QueryJobConfig: Default job
configuration for queries.
The context's :class:`~google.cloud.bigquery.job.QueryJobConfig` is
used for queries. Some properties can be overridden with arguments to
the magics.
Example:
Manually setting the default value for ``maximum_bytes_billed``
to 100 MB:
>>> from google.cloud.bigquery import magics
>>> magics.context.default_query_job_config.maximum_bytes_billed = 100000000
"""
return self._default_query_job_config
@default_query_job_config.setter
def default_query_job_config(self, value):
self._default_query_job_config = value
@property
def progress_bar_type(self):
"""str: Default progress bar type to use to display progress bar while
executing queries through IPython magics.
Note:
Install the ``tqdm`` package to use this feature.
Example:
Manually setting the progress_bar_type:
>>> from google.cloud.bigquery import magics
>>> magics.context.progress_bar_type = "tqdm_notebook"
"""
return self._progress_bar_type
@progress_bar_type.setter
def progress_bar_type(self, value):
self._progress_bar_type = value
# If bigquery_magics is available, we load that extension rather than this one.
# Ensure google.cloud.bigquery.magics.context setters are on the correct magics
# implementation in case the user has installed the package but hasn't updated
# their code.
if bigquery_magics is not None:
context = bigquery_magics.context
else:
context = Context()
def _handle_error(error, destination_var=None):
"""Process a query execution error.
Args:
error (Exception):
An exception that occurred during the query execution.
destination_var (Optional[str]):
The name of the IPython session variable to store the query job.
"""
if destination_var:
query_job = getattr(error, "query_job", None)
if query_job is not None:
IPython.get_ipython().push({destination_var: query_job})
else:
# this is the case when previewing table rows by providing just
# table ID to cell magic
print(
"Could not save output to variable '{}'.".format(destination_var),
file=sys.stderr,
)
print("\nERROR:\n", str(error), file=sys.stderr)
def _run_query(client, query, job_config=None):
"""Runs a query while printing status updates
Args:
client (google.cloud.bigquery.client.Client):
Client to bundle configuration needed for API requests.
query (str):
SQL query to be executed. Defaults to the standard SQL dialect.
Use the ``job_config`` parameter to change dialects.
job_config (Optional[google.cloud.bigquery.job.QueryJobConfig]):
Extra configuration options for the job.
Returns:
google.cloud.bigquery.job.QueryJob: the query job created
Example:
>>> client = bigquery.Client()
>>> _run_query(client, "SELECT 17")
Executing query with job ID: bf633912-af2c-4780-b568-5d868058632b
Query executing: 1.66s
Query complete after 2.07s
'bf633912-af2c-4780-b568-5d868058632b'
"""
start_time = time.perf_counter()
query_job = client.query(query, job_config=job_config)
if job_config and job_config.dry_run:
return query_job
print(f"Executing query with job ID: {query_job.job_id}")
while True:
print(
f"\rQuery executing: {time.perf_counter() - start_time:.2f}s".format(),
end="",
)
try:
query_job.result(timeout=0.5)
break
except futures.TimeoutError:
continue
print(f"\nJob ID {query_job.job_id} successfully executed")
return query_job
def _create_dataset_if_necessary(client, dataset_id):
"""Create a dataset in the current project if it doesn't exist.
Args:
client (google.cloud.bigquery.client.Client):
Client to bundle configuration needed for API requests.
dataset_id (str):
Dataset id.
"""
dataset_reference = bigquery.dataset.DatasetReference(client.project, dataset_id)
try:
dataset = client.get_dataset(dataset_reference)
return
except NotFound:
pass
dataset = bigquery.Dataset(dataset_reference)
dataset.location = client.location
print(f"Creating dataset: {dataset_id}")
dataset = client.create_dataset(dataset)
@magic_arguments.magic_arguments()
@magic_arguments.argument(
"destination_var",
nargs="?",
help=("If provided, save the output to this variable instead of displaying it."),
)
@magic_arguments.argument(
"--destination_table",
type=str,
default=None,
help=(
"If provided, save the output of the query to a new BigQuery table. "
"Variable should be in a format <dataset_id>.<table_id>. "
"If table does not exists, it will be created. "
"If table already exists, its data will be overwritten."
),
)
@magic_arguments.argument(
"--project",
type=str,
default=None,
help=("Project to use for executing this query. Defaults to the context project."),
)
@magic_arguments.argument(
"--max_results",
default=None,
help=(
"Maximum number of rows in dataframe returned from executing the query."
"Defaults to returning all rows."
),
)
@magic_arguments.argument(
"--maximum_bytes_billed",
default=None,
help=(
"maximum_bytes_billed to use for executing this query. Defaults to "
"the context default_query_job_config.maximum_bytes_billed."
),
)
@magic_arguments.argument(
"--dry_run",
action="store_true",
default=False,
help=(
"Sets query to be a dry run to estimate costs. "
"Defaults to executing the query instead of dry run if this argument is not used."
),
)
@magic_arguments.argument(
"--use_legacy_sql",
action="store_true",
default=False,
help=(
"Sets query to use Legacy SQL instead of Standard SQL. Defaults to "
"Standard SQL if this argument is not used."
),
)
@magic_arguments.argument(
"--bigquery_api_endpoint",
type=str,
default=None,
help=(
"The desired API endpoint, e.g., bigquery.googlepis.com. Defaults to this "
"option's value in the context bigquery_client_options."
),
)
@magic_arguments.argument(
"--bqstorage_api_endpoint",
type=str,
default=None,
help=(
"The desired API endpoint, e.g., bigquerystorage.googlepis.com. Defaults to "
"this option's value in the context bqstorage_client_options."
),
)
@magic_arguments.argument(
"--no_query_cache",
action="store_true",
default=False,
help=("Do not use cached query results."),
)
@magic_arguments.argument(
"--use_bqstorage_api",
action="store_true",
default=None,
help=(
"[Deprecated] The BigQuery Storage API is already used by default to "
"download large query results, and this option has no effect. "
"If you want to switch to the classic REST API instead, use the "
"--use_rest_api option."
),
)
@magic_arguments.argument(
"--use_rest_api",
action="store_true",
default=False,
help=(
"Use the classic REST API instead of the BigQuery Storage API to "
"download query results."
),
)
@magic_arguments.argument(
"--verbose",
action="store_true",
default=False,
help=(
"If set, print verbose output, including the query job ID and the "
"amount of time for the query to finish. By default, this "
"information will be displayed as the query runs, but will be "
"cleared after the query is finished."
),
)
@magic_arguments.argument(
"--params",
nargs="+",
default=None,
help=(
"Parameters to format the query string. If present, the --params "
"flag should be followed by a string representation of a dictionary "
"in the format {'param_name': 'param_value'} (ex. {\"num\": 17}), "
"or a reference to a dictionary in the same format. The dictionary "
"reference can be made by including a '$' before the variable "
"name (ex. $my_dict_var)."
),
)
@magic_arguments.argument(
"--progress_bar_type",
type=str,
default=None,
help=(
"Sets progress bar type to display a progress bar while executing the query."
"Defaults to use tqdm_notebook. Install the ``tqdm`` package to use this feature."
),
)
@magic_arguments.argument(
"--location",
type=str,
default=None,
help=(
"Set the location to execute query."
"Defaults to location set in query setting in console."
),
)
def _cell_magic(line, query):
"""Underlying function for bigquery cell magic
Note:
This function contains the underlying logic for the 'bigquery' cell
magic. This function is not meant to be called directly.
Args:
line (str): "%%bigquery" followed by arguments as required
query (str): SQL query to run
Returns:
pandas.DataFrame: the query results.
"""
# The built-in parser does not recognize Python structures such as dicts, thus
# we extract the "--params" option and interpret it separately.
try:
params_option_value, rest_of_args = _split_args_line(line)
except lap.exceptions.QueryParamsParseError as exc:
rebranded_error = SyntaxError(
"--params is not a correctly formatted JSON string or a JSON "
"serializable dictionary"
)
raise rebranded_error from exc
except lap.exceptions.DuplicateQueryParamsError as exc:
rebranded_error = ValueError("Duplicate --params option.")
raise rebranded_error from exc
except lap.exceptions.ParseError as exc:
rebranded_error = ValueError(
"Unrecognized input, are option values correct? "
"Error details: {}".format(exc.args[0])
)
raise rebranded_error from exc
args = magic_arguments.parse_argstring(_cell_magic, rest_of_args)
if args.use_bqstorage_api is not None:
warnings.warn(
"Deprecated option --use_bqstorage_api, the BigQuery "
"Storage API is already used by default.",
category=DeprecationWarning,
)
use_bqstorage_api = not args.use_rest_api
location = args.location
params = []
if params_option_value:
# A non-existing params variable is not expanded and ends up in the input
# in its raw form, e.g. "$query_params".
if params_option_value.startswith("$"):
msg = 'Parameter expansion failed, undefined variable "{}".'.format(
params_option_value[1:]
)
raise NameError(msg)
params = _helpers.to_query_parameters(ast.literal_eval(params_option_value), {})
project = args.project or context.project
bigquery_client_options = copy.deepcopy(context.bigquery_client_options)
if args.bigquery_api_endpoint:
if isinstance(bigquery_client_options, dict):
bigquery_client_options["api_endpoint"] = args.bigquery_api_endpoint
else:
bigquery_client_options.api_endpoint = args.bigquery_api_endpoint
client = bigquery.Client(
project=project,
credentials=context.credentials,
default_query_job_config=context.default_query_job_config,
client_info=client_info.ClientInfo(user_agent=IPYTHON_USER_AGENT),
client_options=bigquery_client_options,
location=location,
)
if context._connection:
client._connection = context._connection
bqstorage_client_options = copy.deepcopy(context.bqstorage_client_options)
if args.bqstorage_api_endpoint:
if isinstance(bqstorage_client_options, dict):
bqstorage_client_options["api_endpoint"] = args.bqstorage_api_endpoint
else:
bqstorage_client_options.api_endpoint = args.bqstorage_api_endpoint
bqstorage_client = _make_bqstorage_client(
client,
use_bqstorage_api,
bqstorage_client_options,
)
close_transports = functools.partial(_close_transports, client, bqstorage_client)
try:
if args.max_results:
max_results = int(args.max_results)
else:
max_results = None
query = query.strip()
if not query:
error = ValueError("Query is missing.")
_handle_error(error, args.destination_var)
return
# Check if query is given as a reference to a variable.
if query.startswith("$"):
query_var_name = query[1:]
if not query_var_name:
missing_msg = 'Missing query variable name, empty "$" is not allowed.'
raise NameError(missing_msg)
if query_var_name.isidentifier():
ip = IPython.get_ipython()
query = ip.user_ns.get(query_var_name, ip) # ip serves as a sentinel
if query is ip:
raise NameError(
f"Unknown query, variable {query_var_name} does not exist."
)
else:
if not isinstance(query, (str, bytes)):
raise TypeError(
f"Query variable {query_var_name} must be a string "
"or a bytes-like value."
)
# Any query that does not contain whitespace (aside from leading and trailing whitespace)
# is assumed to be a table id
if not re.search(r"\s", query):
try:
rows = client.list_rows(query, max_results=max_results)
except Exception as ex:
_handle_error(ex, args.destination_var)
return
result = rows.to_dataframe(
bqstorage_client=bqstorage_client,
create_bqstorage_client=False,
)
if args.destination_var:
IPython.get_ipython().push({args.destination_var: result})
return
else:
return result
job_config = bigquery.job.QueryJobConfig()
job_config.query_parameters = params
job_config.use_legacy_sql = args.use_legacy_sql
job_config.dry_run = args.dry_run
# Don't override context job config unless --no_query_cache is explicitly set.
if args.no_query_cache:
job_config.use_query_cache = False
if args.destination_table:
split = args.destination_table.split(".")
if len(split) != 2:
raise ValueError(
"--destination_table should be in a <dataset_id>.<table_id> format."
)
dataset_id, table_id = split
job_config.allow_large_results = True
dataset_ref = bigquery.dataset.DatasetReference(client.project, dataset_id)
destination_table_ref = dataset_ref.table(table_id)
job_config.destination = destination_table_ref
job_config.create_disposition = "CREATE_IF_NEEDED"
job_config.write_disposition = "WRITE_TRUNCATE"
_create_dataset_if_necessary(client, dataset_id)
if args.maximum_bytes_billed == "None":
job_config.maximum_bytes_billed = 0
elif args.maximum_bytes_billed is not None:
value = int(args.maximum_bytes_billed)
job_config.maximum_bytes_billed = value
try:
query_job = _run_query(client, query, job_config=job_config)
except Exception as ex:
_handle_error(ex, args.destination_var)
return
if not args.verbose:
display.clear_output()
if args.dry_run and args.destination_var:
IPython.get_ipython().push({args.destination_var: query_job})
return
elif args.dry_run:
print(
"Query validated. This query will process {} bytes.".format(
query_job.total_bytes_processed
)
)
return query_job
progress_bar = context.progress_bar_type or args.progress_bar_type
if max_results:
result = query_job.result(max_results=max_results).to_dataframe(
bqstorage_client=None,
create_bqstorage_client=False,
progress_bar_type=progress_bar,
)
else:
result = query_job.to_dataframe(
bqstorage_client=bqstorage_client,
create_bqstorage_client=False,
progress_bar_type=progress_bar,
)
if args.destination_var:
IPython.get_ipython().push({args.destination_var: result})
else:
return result
finally:
close_transports()
def _split_args_line(line):
"""Split out the --params option value from the input line arguments.
Args:
line (str): The line arguments passed to the cell magic.
Returns:
Tuple[str, str]
"""
lexer = lap.Lexer(line)
scanner = lap.Parser(lexer)
tree = scanner.input_line()
extractor = lap.QueryParamsExtractor()
params_option_value, rest_of_args = extractor.visit(tree)
return params_option_value, rest_of_args
def _make_bqstorage_client(client, use_bqstorage_api, client_options):
"""Creates a BigQuery Storage client.
Args:
client (:class:`~google.cloud.bigquery.client.Client`): BigQuery client.
use_bqstorage_api (bool): whether BigQuery Storage API is used or not.
client_options (:class:`google.api_core.client_options.ClientOptions`):
Custom options used with a new BigQuery Storage client instance
if one is created.
Raises:
ImportError: if google-cloud-bigquery-storage is not installed, or
grpcio package is not installed.
Returns:
None: if ``use_bqstorage_api == False``, or google-cloud-bigquery-storage
is outdated.
A BigQuery Storage API client otherwise.
"""
if not use_bqstorage_api:
return None
try:
_versions_helpers.BQ_STORAGE_VERSIONS.try_import(raise_if_error=True)
except exceptions.BigQueryStorageNotFoundError as err:
customized_error = ImportError(
"The default BigQuery Storage API client cannot be used, install "
"the missing google-cloud-bigquery-storage and pyarrow packages "
"to use it. Alternatively, use the classic REST API by specifying "
"the --use_rest_api magic option."
)
raise customized_error from err
except exceptions.LegacyBigQueryStorageError:
pass
try:
from google.api_core.gapic_v1 import client_info as gapic_client_info
except ImportError as err:
customized_error = ImportError(
"Install the grpcio package to use the BigQuery Storage API."
)
raise customized_error from err
return client._ensure_bqstorage_client(
client_options=client_options,
client_info=gapic_client_info.ClientInfo(user_agent=IPYTHON_USER_AGENT),
)
def _close_transports(client, bqstorage_client):
"""Close the given clients' underlying transport channels.
Closing the transport is needed to release system resources, namely open
sockets.
Args:
client (:class:`~google.cloud.bigquery.client.Client`):
bqstorage_client
(Optional[:class:`~google.cloud.bigquery_storage.BigQueryReadClient`]):
A client for the BigQuery Storage API.
"""
client.close()
if bqstorage_client is not None:
bqstorage_client._transport.grpc_channel.close()
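# Example (illustrative only, not part of the original module): a rough sketch of
# invoking the cell magic programmatically from an active IPython session,
# assuming the ``%%bigquery`` magic has already been registered. In a notebook the
# equivalent is a ``%%bigquery df --params {"num": 17}`` cell with the query body
# below it. The query and variable names are placeholders.
def _example_run_cell_magic():
    ip = IPython.get_ipython()
    ip.run_cell_magic(
        "bigquery",
        'df --params {"num": 17}',
        "SELECT @num AS num",
    )
    # The resulting DataFrame is pushed into the user namespace as ``df``.
    return ip.user_ns.get("df")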

View File

@@ -0,0 +1,517 @@
# -*- coding: utf-8 -*-
#
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Define resources for the BigQuery ML Models API."""
from __future__ import annotations # type: ignore
import copy
import datetime
import typing
from typing import Any, Dict, Optional, Sequence, Union
import google.cloud._helpers # type: ignore
from google.cloud.bigquery import _helpers
from google.cloud.bigquery import standard_sql
from google.cloud.bigquery.encryption_configuration import EncryptionConfiguration
class Model:
"""Model represents a machine learning model resource.
See
https://cloud.google.com/bigquery/docs/reference/rest/v2/models
Args:
model_ref:
A pointer to a model. If ``model_ref`` is a string, it must
include a project ID, dataset ID, and model ID, each separated
by ``.``.
"""
_PROPERTY_TO_API_FIELD = {
"expires": "expirationTime",
"friendly_name": "friendlyName",
# Even though it's not necessary for field mapping to map when the
# property name equals the resource name, we add these here so that we
# have an exhaustive list of all mutable properties.
"labels": "labels",
"description": "description",
"encryption_configuration": "encryptionConfiguration",
}
def __init__(self, model_ref: Union["ModelReference", str, None]):
# Use _properties on read-write properties to match the REST API
# semantics. The BigQuery API makes a distinction between an unset
# value, a null value, and a default value (0 or ""), but the protocol
# buffer classes do not.
self._properties: Dict[str, Any] = {}
if isinstance(model_ref, str):
model_ref = ModelReference.from_string(model_ref)
if model_ref:
self._properties["modelReference"] = model_ref.to_api_repr()
@property
def reference(self) -> Optional["ModelReference"]:
"""A model reference pointing to this model.
Read-only.
"""
resource = self._properties.get("modelReference")
if resource is None:
return None
else:
return ModelReference.from_api_repr(resource)
@property
def project(self) -> Optional[str]:
"""Project bound to the model."""
ref = self.reference
return ref.project if ref is not None else None
@property
def dataset_id(self) -> Optional[str]:
"""ID of dataset containing the model."""
ref = self.reference
return ref.dataset_id if ref is not None else None
@property
def model_id(self) -> Optional[str]:
"""The model ID."""
ref = self.reference
return ref.model_id if ref is not None else None
@property
def path(self) -> Optional[str]:
"""URL path for the model's APIs."""
ref = self.reference
return ref.path if ref is not None else None
@property
def location(self) -> Optional[str]:
"""The geographic location where the model resides.
This value is inherited from the dataset.
Read-only.
"""
return typing.cast(Optional[str], self._properties.get("location"))
@property
def etag(self) -> Optional[str]:
"""ETag for the model resource (:data:`None` until set from the server).
Read-only.
"""
return typing.cast(Optional[str], self._properties.get("etag"))
@property
def created(self) -> Optional[datetime.datetime]:
"""Datetime at which the model was created (:data:`None` until set from the server).
Read-only.
"""
value = typing.cast(Optional[float], self._properties.get("creationTime"))
if value is None:
return None
else:
# value will be in milliseconds.
return google.cloud._helpers._datetime_from_microseconds(
1000.0 * float(value)
)
@property
def modified(self) -> Optional[datetime.datetime]:
"""Datetime at which the model was last modified (:data:`None` until set from the server).
Read-only.
"""
value = typing.cast(Optional[float], self._properties.get("lastModifiedTime"))
if value is None:
return None
else:
# value will be in milliseconds.
return google.cloud._helpers._datetime_from_microseconds(
1000.0 * float(value)
)
@property
def model_type(self) -> str:
"""Type of the model resource.
Read-only.
"""
return typing.cast(
str, self._properties.get("modelType", "MODEL_TYPE_UNSPECIFIED")
)
@property
def training_runs(self) -> Sequence[Dict[str, Any]]:
"""Information for all training runs in increasing order of start time.
Dictionaries are in REST API format. See:
https://cloud.google.com/bigquery/docs/reference/rest/v2/models#trainingrun
Read-only.
"""
return typing.cast(
Sequence[Dict[str, Any]], self._properties.get("trainingRuns", [])
)
@property
def feature_columns(self) -> Sequence[standard_sql.StandardSqlField]:
"""Input feature columns that were used to train this model.
Read-only.
"""
resource: Sequence[Dict[str, Any]] = typing.cast(
Sequence[Dict[str, Any]], self._properties.get("featureColumns", [])
)
return [
standard_sql.StandardSqlField.from_api_repr(column) for column in resource
]
@property
def transform_columns(self) -> Sequence[TransformColumn]:
"""The input feature columns that were used to train this model.
The output transform columns used to train this model.
See REST API:
https://cloud.google.com/bigquery/docs/reference/rest/v2/models#transformcolumn
Read-only.
"""
resources: Sequence[Dict[str, Any]] = typing.cast(
Sequence[Dict[str, Any]], self._properties.get("transformColumns", [])
)
return [TransformColumn(resource) for resource in resources]
@property
def label_columns(self) -> Sequence[standard_sql.StandardSqlField]:
"""Label columns that were used to train this model.
The output of the model will have a ``predicted_`` prefix added to these column names.
Read-only.
"""
resource: Sequence[Dict[str, Any]] = typing.cast(
Sequence[Dict[str, Any]], self._properties.get("labelColumns", [])
)
return [
standard_sql.StandardSqlField.from_api_repr(column) for column in resource
]
@property
def best_trial_id(self) -> Optional[int]:
"""The best trial_id across all training runs.
.. deprecated::
This property is deprecated!
Read-only.
"""
value = typing.cast(Optional[int], self._properties.get("bestTrialId"))
if value is not None:
value = int(value)
return value
@property
def expires(self) -> Optional[datetime.datetime]:
"""The datetime when this model expires.
If not present, the model will persist indefinitely. Expired models will be
deleted and their storage reclaimed.
"""
value = typing.cast(Optional[float], self._properties.get("expirationTime"))
if value is None:
return None
else:
# value will be in milliseconds.
return google.cloud._helpers._datetime_from_microseconds(
1000.0 * float(value)
)
@expires.setter
def expires(self, value: Optional[datetime.datetime]):
if value is None:
value_to_store: Optional[str] = None
else:
value_to_store = str(google.cloud._helpers._millis_from_datetime(value))
# TODO: Consider using typing.TypedDict when only Python 3.8+ is supported.
self._properties["expirationTime"] = value_to_store # type: ignore
@property
def description(self) -> Optional[str]:
"""Description of the model (defaults to :data:`None`)."""
return typing.cast(Optional[str], self._properties.get("description"))
@description.setter
def description(self, value: Optional[str]):
# TODO: Consider using typing.TypedDict when only Python 3.8+ is supported.
self._properties["description"] = value # type: ignore
@property
def friendly_name(self) -> Optional[str]:
"""Title of the table (defaults to :data:`None`)."""
return typing.cast(Optional[str], self._properties.get("friendlyName"))
@friendly_name.setter
def friendly_name(self, value: Optional[str]):
# TODO: Consider using typing.TypedDict when only Python 3.8+ is supported.
self._properties["friendlyName"] = value # type: ignore
@property
def labels(self) -> Dict[str, str]:
"""Labels for the table.
This method always returns a dict. To change a model's labels, modify the dict,
then call ``Client.update_model``. To delete a label, set its value to
:data:`None` before updating.
"""
return self._properties.setdefault("labels", {})
@labels.setter
def labels(self, value: Optional[Dict[str, str]]):
if value is None:
value = {}
self._properties["labels"] = value
@property
def encryption_configuration(self) -> Optional[EncryptionConfiguration]:
"""Custom encryption configuration for the model.
Custom encryption configuration (e.g., Cloud KMS keys) or :data:`None`
if using default encryption.
See `protecting data with Cloud KMS keys
<https://cloud.google.com/bigquery/docs/customer-managed-encryption>`_
in the BigQuery documentation.
"""
prop = self._properties.get("encryptionConfiguration")
if prop:
prop = EncryptionConfiguration.from_api_repr(prop)
return typing.cast(Optional[EncryptionConfiguration], prop)
@encryption_configuration.setter
def encryption_configuration(self, value: Optional[EncryptionConfiguration]):
api_repr = value.to_api_repr() if value else value
self._properties["encryptionConfiguration"] = api_repr
@classmethod
def from_api_repr(cls, resource: Dict[str, Any]) -> "Model":
"""Factory: construct a model resource given its API representation
Args:
resource:
Model resource representation from the API
Returns:
Model parsed from ``resource``.
"""
this = cls(None)
resource = copy.deepcopy(resource)
this._properties = resource
return this
def _build_resource(self, filter_fields):
"""Generate a resource for ``update``."""
return _helpers._build_resource_from_properties(self, filter_fields)
def __repr__(self):
return f"Model(reference={self.reference!r})"
def to_api_repr(self) -> Dict[str, Any]:
"""Construct the API resource representation of this model.
Returns:
Model reference represented as an API resource
"""
return copy.deepcopy(self._properties)
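# Example (illustrative only, not part of the original class): a hedged sketch of
# the read-modify-update flow described in the property docstrings above. The
# project, dataset, and model IDs are placeholders; ``client`` is assumed to be an
# existing ``google.cloud.bigquery.Client``.
def _example_update_model(client):
    model = client.get_model("my-project.my_dataset.my_model")
    model.description = "Churn prediction model"
    model.labels = {"team": "analytics"}
    # Only the listed fields are sent in the update request.
    return client.update_model(model, ["description", "labels"])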
class ModelReference:
"""ModelReferences are pointers to models.
See
https://cloud.google.com/bigquery/docs/reference/rest/v2/models#modelreference
"""
def __init__(self):
self._properties = {}
@property
def project(self):
"""str: Project bound to the model"""
return self._properties.get("projectId")
@property
def dataset_id(self):
"""str: ID of dataset containing the model."""
return self._properties.get("datasetId")
@property
def model_id(self):
"""str: The model ID."""
return self._properties.get("modelId")
@property
def path(self) -> str:
"""URL path for the model's APIs."""
return f"/projects/{self.project}/datasets/{self.dataset_id}/models/{self.model_id}"
@classmethod
def from_api_repr(cls, resource: Dict[str, Any]) -> "ModelReference":
"""Factory: construct a model reference given its API representation.
Args:
resource:
Model reference representation returned from the API
Returns:
Model reference parsed from ``resource``.
"""
ref = cls()
ref._properties = resource
return ref
@classmethod
def from_string(
cls, model_id: str, default_project: Optional[str] = None
) -> "ModelReference":
"""Construct a model reference from model ID string.
Args:
model_id:
A model ID in standard SQL format. If ``default_project``
is not specified, this must include a project ID, dataset
ID, and model ID, each separated by ``.``.
default_project:
The project ID to use when ``model_id`` does not include
a project ID.
Returns:
Model reference parsed from ``model_id``.
Raises:
ValueError:
If ``model_id`` is not a fully-qualified model ID in
standard SQL format.
"""
proj, dset, model = _helpers._parse_3_part_id(
model_id, default_project=default_project, property_name="model_id"
)
return cls.from_api_repr(
{"projectId": proj, "datasetId": dset, "modelId": model}
)
def to_api_repr(self) -> Dict[str, Any]:
"""Construct the API resource representation of this model reference.
Returns:
Model reference represented as an API resource.
"""
return copy.deepcopy(self._properties)
def _key(self):
"""Unique key for this model.
This is used for hashing a ModelReference.
"""
return self.project, self.dataset_id, self.model_id
def __eq__(self, other):
if not isinstance(other, ModelReference):
return NotImplemented
return self._properties == other._properties
def __ne__(self, other):
return not self == other
def __hash__(self):
return hash(self._key())
def __repr__(self):
return "ModelReference(project_id='{}', dataset_id='{}', model_id='{}')".format(
self.project, self.dataset_id, self.model_id
)
class TransformColumn:
"""TransformColumn represents a transform column feature.
See
https://cloud.google.com/bigquery/docs/reference/rest/v2/models#transformcolumn
Args:
resource:
A dictionary representing a transform column feature.
"""
def __init__(self, resource: Dict[str, Any]):
self._properties = resource
@property
def name(self) -> Optional[str]:
"""Name of the column."""
return self._properties.get("name")
@property
def type_(self) -> Optional[standard_sql.StandardSqlDataType]:
"""Data type of the column after the transform.
Returns:
Optional[google.cloud.bigquery.standard_sql.StandardSqlDataType]:
Data type of the column.
"""
type_json = self._properties.get("type")
if type_json is None:
return None
return standard_sql.StandardSqlDataType.from_api_repr(type_json)
@property
def transform_sql(self) -> Optional[str]:
"""The SQL expression used in the column transform."""
return self._properties.get("transformSql")
@classmethod
def from_api_repr(cls, resource: Dict[str, Any]) -> "TransformColumn":
"""Constructs a transform column feature given its API representation
Args:
resource:
Transform column feature representation from the API
Returns:
Transform column feature parsed from ``resource``.
"""
this = cls({})
resource = copy.deepcopy(resource)
this._properties = resource
return this
def _model_arg_to_model_ref(value, default_project=None):
"""Helper to convert a string or Model to ModelReference.
This function keeps ModelReference and other kinds of objects unchanged.
"""
if isinstance(value, str):
return ModelReference.from_string(value, default_project=default_project)
if isinstance(value, Model):
return value.reference
return value
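# Example (illustrative only, not part of the original module): a small sketch of
# the reference helpers above. The project, dataset, and model IDs are placeholders.
def _example_model_references():
    ref = ModelReference.from_string("my-project.my_dataset.my_model")
    assert (ref.project, ref.dataset_id, ref.model_id) == (
        "my-project",
        "my_dataset",
        "my_model",
    )
    # Strings, Model instances, and ModelReference objects are all accepted by
    # client methods; this helper normalizes them to a ModelReference.
    same_ref = _model_arg_to_model_ref("my-project.my_dataset.my_model")
    assert same_ref == ref
    return ref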

View File

@@ -0,0 +1,164 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from contextlib import contextmanager
from google.api_core.exceptions import GoogleAPICallError # type: ignore
logger = logging.getLogger(__name__)
try:
from opentelemetry import trace # type: ignore
from opentelemetry.instrumentation.utils import http_status_to_status_code # type: ignore
from opentelemetry.trace.status import Status # type: ignore
HAS_OPENTELEMETRY = True
_warned_telemetry = True
except ImportError:
HAS_OPENTELEMETRY = False
_warned_telemetry = False
_default_attributes = {
"db.system": "BigQuery"
} # static, default values assigned to all spans
@contextmanager
def create_span(name, attributes=None, client=None, job_ref=None):
"""Creates a ContextManager for a Span to be exported to the configured exporter.
If no configuration exists, yields None.
Args:
name (str): Name that will be set for the span being created
attributes (Optional[dict]):
Additional attributes that pertain to
the specific API call (i.e. not a default attribute)
client (Optional[google.cloud.bigquery.client.Client]):
Pass in a Client object to extract any attributes that may be
relevant to it and add them to the created spans.
job_ref (Optional[google.cloud.bigquery.job._AsyncJob]):
Pass in a _AsyncJob object to extract any attributes that may be
relevant to it and add them to the created spans.
Yields:
opentelemetry.trace.Span: Yields the newly created Span.
Raises:
google.api_core.exceptions.GoogleAPICallError:
Raised if a span could not be yielded or issue with call to
OpenTelemetry.
"""
global _warned_telemetry
final_attributes = _get_final_span_attributes(attributes, client, job_ref)
if not HAS_OPENTELEMETRY:
if not _warned_telemetry:
logger.debug(
"This service is instrumented using OpenTelemetry. "
"OpenTelemetry or one of its components could not be imported; "
"please add compatible versions of opentelemetry-api and "
"opentelemetry-instrumentation packages in order to get BigQuery "
"Tracing data."
)
_warned_telemetry = True
yield None
return
tracer = trace.get_tracer(__name__)
# yield new span value
with tracer.start_as_current_span(name=name, attributes=final_attributes) as span:
try:
yield span
except GoogleAPICallError as error:
if error.code is not None:
span.set_status(Status(http_status_to_status_code(error.code)))
raise
def _get_final_span_attributes(attributes=None, client=None, job_ref=None):
"""Compiles attributes from: client, job_ref, user-provided attributes.
Attributes from all of these sources are merged together. Note the
attributes are added sequentially based on perceived order of precedence:
i.e. attributes added last may overwrite attributes added earlier.
Args:
attributes (Optional[dict]):
Additional attributes that pertain to
the specific API call (i.e. not a default attribute)
client (Optional[google.cloud.bigquery.client.Client]):
Pass in a Client object to extract any attributes that may be
relevant to it and add them to the final_attributes
job_ref (Optional[google.cloud.bigquery.job._AsyncJob]):
Pass in a _AsyncJob object to extract any attributes that may be
relevant to it and add them to the final_attributes.
Returns: dict
"""
collected_attributes = _default_attributes.copy()
if client:
collected_attributes.update(_set_client_attributes(client))
if job_ref:
collected_attributes.update(_set_job_attributes(job_ref))
if attributes:
collected_attributes.update(attributes)
final_attributes = {k: v for k, v in collected_attributes.items() if v is not None}
return final_attributes
def _set_client_attributes(client):
return {"db.name": client.project, "location": client.location}
def _set_job_attributes(job_ref):
job_attributes = {
"db.name": job_ref.project,
"job_id": job_ref.job_id,
"state": job_ref.state,
}
job_attributes["hasErrors"] = job_ref.error_result is not None
if job_ref.created is not None:
job_attributes["timeCreated"] = job_ref.created.isoformat()
if job_ref.started is not None:
job_attributes["timeStarted"] = job_ref.started.isoformat()
if job_ref.ended is not None:
job_attributes["timeEnded"] = job_ref.ended.isoformat()
if job_ref.location is not None:
job_attributes["location"] = job_ref.location
if job_ref.parent_job_id is not None:
job_attributes["parent_job_id"] = job_ref.parent_job_id
if job_ref.num_child_jobs is not None:
job_attributes["num_child_jobs"] = job_ref.num_child_jobs
total_bytes_billed = getattr(job_ref, "total_bytes_billed", None)
if total_bytes_billed is not None:
job_attributes["total_bytes_billed"] = total_bytes_billed
total_bytes_processed = getattr(job_ref, "total_bytes_processed", None)
if total_bytes_processed is not None:
job_attributes["total_bytes_processed"] = total_bytes_processed
return job_attributes
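# Example (illustrative only, not part of the original module): a hedged sketch of
# wrapping an operation in a span. ``create_span`` yields ``None`` when
# OpenTelemetry (or a configured exporter) is not available, so callers must not
# assume a real span object. The span name and attributes here are placeholders.
def _example_create_span():
    with create_span(
        name="BigQuery.example.operation", attributes={"path": "/projects/example"}
    ) as span:
        if span is not None:
            span.set_attribute("example.custom", "value")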

View File

@@ -0,0 +1,2 @@
# Marker file for PEP 561.
# The google-cloud-bigquery package uses inline types.

File diff suppressed because it is too large

View File

@@ -0,0 +1,207 @@
# Copyright 2018 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from google.api_core import exceptions
from google.api_core import retry
import google.api_core.future.polling
from google.auth import exceptions as auth_exceptions # type: ignore
import requests.exceptions
_RETRYABLE_REASONS = frozenset(
["rateLimitExceeded", "backendError", "internalError", "badGateway"]
)
_UNSTRUCTURED_RETRYABLE_TYPES = (
ConnectionError,
exceptions.TooManyRequests,
exceptions.InternalServerError,
exceptions.BadGateway,
exceptions.ServiceUnavailable,
requests.exceptions.ChunkedEncodingError,
requests.exceptions.ConnectionError,
requests.exceptions.Timeout,
auth_exceptions.TransportError,
)
_DEFAULT_RETRY_DEADLINE = 10.0 * 60.0 # 10 minutes
# Ambiguous errors (e.g. internalError, backendError, rateLimitExceeded) retry
# until the full `_DEFAULT_RETRY_DEADLINE`. This is because the
# `jobs.getQueryResults` REST API translates a job failure into an HTTP error.
#
# TODO(https://github.com/googleapis/python-bigquery/issues/1903): Investigate
# if we can fail early for ambiguous errors in `QueryJob.result()`'s call to
# the `jobs.getQueryResult` API.
#
# We need `_DEFAULT_JOB_DEADLINE` to be some multiple of
# `_DEFAULT_RETRY_DEADLINE` to allow for a few retries after the retry
# timeout is reached.
#
# Note: This multiple should actually be a multiple of
# (2 * _DEFAULT_RETRY_DEADLINE). After an ambiguous exception, the first
# call from `job_retry()` refreshes the job state without actually restarting
# the query. The second `job_retry()` actually restarts the query. For a more
# detailed explanation, see the comments where we set `restart_query_job = True`
# in `QueryJob.result()`'s inner `is_job_done()` function.
_DEFAULT_JOB_DEADLINE = 2.0 * (2.0 * _DEFAULT_RETRY_DEADLINE)
def _should_retry(exc):
"""Predicate for determining when to retry.
We retry if the error's 'reason' is one of the retryable reasons
('rateLimitExceeded', 'backendError', 'internalError', 'badGateway'), or if the
exception is one of the known retryable transport-level error types.
"""
if not hasattr(exc, "errors") or len(exc.errors) == 0:
# Check for unstructured error returns, e.g. from GFE
return isinstance(exc, _UNSTRUCTURED_RETRYABLE_TYPES)
reason = exc.errors[0]["reason"]
return reason in _RETRYABLE_REASONS
DEFAULT_RETRY = retry.Retry(predicate=_should_retry, deadline=_DEFAULT_RETRY_DEADLINE)
"""The default retry object.
Any method with a ``retry`` parameter will be retried automatically,
with reasonable defaults. To disable retry, pass ``retry=None``.
To modify the default retry behavior, call a ``with_XXX`` method
on ``DEFAULT_RETRY``. For example, to change the deadline to 30 seconds,
pass ``retry=bigquery.DEFAULT_RETRY.with_deadline(30)``.
"""
def _should_retry_get_job_conflict(exc):
"""Predicate for determining when to retry a jobs.get call after a conflict error.
Sometimes we get a 404 after a Conflict. In this case, we
have pretty high confidence that by retrying the 404, we'll
(hopefully) eventually recover the job.
https://github.com/googleapis/python-bigquery/issues/2134
Note: we may be able to extend this to user-specified predicates
after https://github.com/googleapis/python-api-core/issues/796
to tweak existing Retry object predicates.
"""
return isinstance(exc, exceptions.NotFound) or _should_retry(exc)
# Pick a deadline smaller than our other deadlines since we want to timeout
# before those expire.
_DEFAULT_GET_JOB_CONFLICT_DEADLINE = _DEFAULT_RETRY_DEADLINE / 3.0
_DEFAULT_GET_JOB_CONFLICT_RETRY = retry.Retry(
predicate=_should_retry_get_job_conflict,
deadline=_DEFAULT_GET_JOB_CONFLICT_DEADLINE,
)
"""Private, may be removed in future."""
# Note: Take care when updating DEFAULT_TIMEOUT to anything but None. We
# briefly had a default timeout, but even setting it at more than twice the
# theoretical server-side default timeout of 2 minutes was not enough for
# complex queries. See:
# https://github.com/googleapis/python-bigquery/issues/970#issuecomment-921934647
DEFAULT_TIMEOUT = None
"""The default API timeout.
This is the time to wait per request. To adjust the total wait time, set a
deadline on the retry object.
"""
job_retry_reasons = (
"rateLimitExceeded",
"backendError",
"internalError",
"jobRateLimitExceeded",
)
def _job_should_retry(exc):
# Sometimes we have ambiguous errors, such as 'backendError' which could
# be due to an API problem or a job problem. For these, make sure we retry
# our is_job_done() function.
#
# Note: This won't restart the job unless we know for sure it's because of
# the job status and set restart_query_job = True in that loop. This means
# that we might end up calling this predicate twice for the same job
# but from different paths: (1) from jobs.getQueryResults RetryError and
# (2) from translating the job error from the body of a jobs.get response.
#
# Note: If we start retrying job types other than queries where we don't
# call the problematic getQueryResults API to check the status, we need
# to provide a different predicate, as there shouldn't be ambiguous
# errors in those cases.
if isinstance(exc, exceptions.RetryError):
exc = exc.cause
# Per https://github.com/googleapis/python-bigquery/issues/1929, sometimes
# retriable errors make their way here. Because of the separate
# `restart_query_job` logic to make sure we aren't restarting non-failed
# jobs, it should be safe to continue and not totally fail our attempt at
# waiting for the query to complete.
if _should_retry(exc):
return True
if not hasattr(exc, "errors") or len(exc.errors) == 0:
return False
reason = exc.errors[0]["reason"]
return reason in job_retry_reasons
DEFAULT_JOB_RETRY = retry.Retry(
predicate=_job_should_retry, deadline=_DEFAULT_JOB_DEADLINE
)
"""
The default job retry object.
"""
def _query_job_insert_should_retry(exc):
# Per https://github.com/googleapis/python-bigquery/issues/2134, sometimes
# we get a 404 error. In this case, if we get this far, assume that the job
# doesn't actually exist and try again. We can't add 404 to the default
# job_retry because that happens for errors like "this table does not
# exist", which probably won't resolve with a retry.
if isinstance(exc, exceptions.RetryError):
exc = exc.cause
if isinstance(exc, exceptions.NotFound):
message = exc.message
# Don't try to retry table/dataset not found, just job not found.
# The URL contains jobs, so use whitespace to disambiguate.
return message is not None and " job" in message.lower()
return _job_should_retry(exc)
_DEFAULT_QUERY_JOB_INSERT_RETRY = retry.Retry(
predicate=_query_job_insert_should_retry,
# jobs.insert doesn't wait for the job to complete, so we don't need the
# long _DEFAULT_JOB_DEADLINE for this part.
deadline=_DEFAULT_RETRY_DEADLINE,
)
"""Private, may be removed in future."""
DEFAULT_GET_JOB_TIMEOUT = 128
"""
Default timeout for Client.get_job().
"""
POLLING_DEFAULT_VALUE = google.api_core.future.polling.PollingFuture._DEFAULT_VALUE
"""
Default value defined in google.api_core.future.polling.PollingFuture.
"""

View File

@@ -0,0 +1,33 @@
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""User-Defined Routines."""
from google.cloud.bigquery.enums import DeterminismLevel
from google.cloud.bigquery.routine.routine import Routine
from google.cloud.bigquery.routine.routine import RoutineArgument
from google.cloud.bigquery.routine.routine import RoutineReference
from google.cloud.bigquery.routine.routine import RoutineType
from google.cloud.bigquery.routine.routine import RemoteFunctionOptions
__all__ = (
"DeterminismLevel",
"Routine",
"RoutineArgument",
"RoutineReference",
"RoutineType",
"RemoteFunctionOptions",
)

View File

@@ -0,0 +1,744 @@
# -*- coding: utf-8 -*-
#
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Define resources for the BigQuery Routines API."""
from typing import Any, Dict, Optional, Union
import google.cloud._helpers # type: ignore
from google.cloud.bigquery import _helpers
from google.cloud.bigquery.standard_sql import StandardSqlDataType
from google.cloud.bigquery.standard_sql import StandardSqlTableType
class RoutineType:
"""The fine-grained type of the routine.
https://cloud.google.com/bigquery/docs/reference/rest/v2/routines#routinetype
.. versionadded:: 2.22.0
"""
ROUTINE_TYPE_UNSPECIFIED = "ROUTINE_TYPE_UNSPECIFIED"
SCALAR_FUNCTION = "SCALAR_FUNCTION"
PROCEDURE = "PROCEDURE"
TABLE_VALUED_FUNCTION = "TABLE_VALUED_FUNCTION"
class Routine(object):
"""Resource representing a user-defined routine.
See
https://cloud.google.com/bigquery/docs/reference/rest/v2/routines
Args:
routine_ref (Union[str, google.cloud.bigquery.routine.RoutineReference]):
A pointer to a routine. If ``routine_ref`` is a string, it must
            include a project ID, dataset ID, and routine ID, each separated
by ``.``.
``**kwargs`` (Dict):
Initial property values.
"""
_PROPERTY_TO_API_FIELD = {
"arguments": "arguments",
"body": "definitionBody",
"created": "creationTime",
"etag": "etag",
"imported_libraries": "importedLibraries",
"language": "language",
"modified": "lastModifiedTime",
"reference": "routineReference",
"return_type": "returnType",
"return_table_type": "returnTableType",
"type_": "routineType",
"description": "description",
"determinism_level": "determinismLevel",
"remote_function_options": "remoteFunctionOptions",
"data_governance_type": "dataGovernanceType",
}
def __init__(self, routine_ref, **kwargs) -> None:
if isinstance(routine_ref, str):
routine_ref = RoutineReference.from_string(routine_ref)
self._properties = {"routineReference": routine_ref.to_api_repr()}
for property_name in kwargs:
setattr(self, property_name, kwargs[property_name])
@property
def reference(self):
"""google.cloud.bigquery.routine.RoutineReference: Reference
describing the ID of this routine.
"""
return RoutineReference.from_api_repr(
self._properties[self._PROPERTY_TO_API_FIELD["reference"]]
)
@property
def path(self):
"""str: URL path for the routine's APIs."""
return self.reference.path
@property
def project(self):
"""str: ID of the project containing the routine."""
return self.reference.project
@property
def dataset_id(self):
"""str: ID of dataset containing the routine."""
return self.reference.dataset_id
@property
def routine_id(self):
"""str: The routine ID."""
return self.reference.routine_id
@property
def etag(self):
"""str: ETag for the resource (:data:`None` until set from the
server).
Read-only.
"""
return self._properties.get(self._PROPERTY_TO_API_FIELD["etag"])
@property
def type_(self):
"""str: The fine-grained type of the routine.
See:
https://cloud.google.com/bigquery/docs/reference/rest/v2/routines#RoutineType
"""
return self._properties.get(self._PROPERTY_TO_API_FIELD["type_"])
@type_.setter
def type_(self, value):
self._properties[self._PROPERTY_TO_API_FIELD["type_"]] = value
@property
def created(self):
"""Optional[datetime.datetime]: Datetime at which the routine was
created (:data:`None` until set from the server).
Read-only.
"""
value = self._properties.get(self._PROPERTY_TO_API_FIELD["created"])
if value is not None and value != 0:
# value will be in milliseconds.
return google.cloud._helpers._datetime_from_microseconds(
1000.0 * float(value)
)
@property
def modified(self):
"""Optional[datetime.datetime]: Datetime at which the routine was
last modified (:data:`None` until set from the server).
Read-only.
"""
value = self._properties.get(self._PROPERTY_TO_API_FIELD["modified"])
if value is not None and value != 0:
# value will be in milliseconds.
return google.cloud._helpers._datetime_from_microseconds(
1000.0 * float(value)
)
@property
def language(self):
"""Optional[str]: The language of the routine.
Defaults to ``SQL``.
"""
return self._properties.get(self._PROPERTY_TO_API_FIELD["language"])
@language.setter
def language(self, value):
self._properties[self._PROPERTY_TO_API_FIELD["language"]] = value
@property
def arguments(self):
"""List[google.cloud.bigquery.routine.RoutineArgument]: Input/output
argument of a function or a stored procedure.
In-place modification is not supported. To set, replace the entire
property value with the modified list of
:class:`~google.cloud.bigquery.routine.RoutineArgument` objects.
"""
resources = self._properties.get(self._PROPERTY_TO_API_FIELD["arguments"], [])
return [RoutineArgument.from_api_repr(resource) for resource in resources]
@arguments.setter
def arguments(self, value):
if not value:
resource = []
else:
resource = [argument.to_api_repr() for argument in value]
self._properties[self._PROPERTY_TO_API_FIELD["arguments"]] = resource
@property
def return_type(self):
"""google.cloud.bigquery.StandardSqlDataType: Return type of
the routine.
If absent, the return type is inferred from
:attr:`~google.cloud.bigquery.routine.Routine.body` at query time in
each query that references this routine. If present, then the
evaluated result will be cast to the specified returned type at query
time.
See:
https://cloud.google.com/bigquery/docs/reference/rest/v2/routines#Routine.FIELDS.return_type
"""
resource = self._properties.get(self._PROPERTY_TO_API_FIELD["return_type"])
if not resource:
return resource
return StandardSqlDataType.from_api_repr(resource)
@return_type.setter
def return_type(self, value: StandardSqlDataType):
resource = None if not value else value.to_api_repr()
self._properties[self._PROPERTY_TO_API_FIELD["return_type"]] = resource
@property
def return_table_type(self) -> Union[StandardSqlTableType, Any, None]:
"""The return type of a Table Valued Function (TVF) routine.
.. versionadded:: 2.22.0
"""
resource = self._properties.get(
self._PROPERTY_TO_API_FIELD["return_table_type"]
)
if not resource:
return resource
return StandardSqlTableType.from_api_repr(resource)
@return_table_type.setter
def return_table_type(self, value: Optional[StandardSqlTableType]):
if not value:
resource = None
else:
resource = value.to_api_repr()
self._properties[self._PROPERTY_TO_API_FIELD["return_table_type"]] = resource
@property
def imported_libraries(self):
"""List[str]: The path of the imported JavaScript libraries.
The :attr:`~google.cloud.bigquery.routine.Routine.language` must
        equal ``JAVASCRIPT``.
Examples:
Set the ``imported_libraries`` to a list of Google Cloud Storage
URIs.
.. code-block:: python
routine = bigquery.Routine("proj.dataset.routine_id")
routine.imported_libraries = [
"gs://cloud-samples-data/bigquery/udfs/max-value.js",
]
"""
return self._properties.get(
self._PROPERTY_TO_API_FIELD["imported_libraries"], []
)
@imported_libraries.setter
def imported_libraries(self, value):
if not value:
resource = []
else:
resource = value
self._properties[self._PROPERTY_TO_API_FIELD["imported_libraries"]] = resource
@property
def body(self):
"""str: The body of the routine."""
return self._properties.get(self._PROPERTY_TO_API_FIELD["body"])
@body.setter
def body(self, value):
self._properties[self._PROPERTY_TO_API_FIELD["body"]] = value
@property
def description(self):
"""Optional[str]: Description of the routine (defaults to
:data:`None`).
"""
return self._properties.get(self._PROPERTY_TO_API_FIELD["description"])
@description.setter
def description(self, value):
self._properties[self._PROPERTY_TO_API_FIELD["description"]] = value
@property
def determinism_level(self):
"""Optional[str]: (experimental) The determinism level of the JavaScript UDF
if defined.
"""
return self._properties.get(self._PROPERTY_TO_API_FIELD["determinism_level"])
@determinism_level.setter
def determinism_level(self, value):
self._properties[self._PROPERTY_TO_API_FIELD["determinism_level"]] = value
@property
def remote_function_options(self):
"""Optional[google.cloud.bigquery.routine.RemoteFunctionOptions]:
Configures remote function options for a routine.
Raises:
ValueError:
If the value is not
:class:`~google.cloud.bigquery.routine.RemoteFunctionOptions` or
:data:`None`.
"""
prop = self._properties.get(
self._PROPERTY_TO_API_FIELD["remote_function_options"]
)
if prop is not None:
return RemoteFunctionOptions.from_api_repr(prop)
@remote_function_options.setter
def remote_function_options(self, value):
api_repr = value
if isinstance(value, RemoteFunctionOptions):
api_repr = value.to_api_repr()
elif value is not None:
raise ValueError(
"value must be google.cloud.bigquery.routine.RemoteFunctionOptions "
"or None"
)
self._properties[
self._PROPERTY_TO_API_FIELD["remote_function_options"]
] = api_repr
@property
def data_governance_type(self):
"""Optional[str]: If set to ``DATA_MASKING``, the function is validated
and made available as a masking function.
Raises:
ValueError:
                If the value is not a :class:`str` or :data:`None`.
"""
return self._properties.get(self._PROPERTY_TO_API_FIELD["data_governance_type"])
@data_governance_type.setter
def data_governance_type(self, value):
if value is not None and not isinstance(value, str):
raise ValueError(
"invalid data_governance_type, must be a string or `None`."
)
self._properties[self._PROPERTY_TO_API_FIELD["data_governance_type"]] = value
@classmethod
def from_api_repr(cls, resource: dict) -> "Routine":
"""Factory: construct a routine given its API representation.
Args:
resource (Dict[str, object]):
Resource, as returned from the API.
Returns:
google.cloud.bigquery.routine.Routine:
Python object, as parsed from ``resource``.
"""
ref = cls(RoutineReference.from_api_repr(resource["routineReference"]))
ref._properties = resource
return ref
def to_api_repr(self) -> dict:
"""Construct the API resource representation of this routine.
Returns:
Dict[str, object]: Routine represented as an API resource.
"""
return self._properties
def _build_resource(self, filter_fields):
"""Generate a resource for ``update``."""
return _helpers._build_resource_from_properties(self, filter_fields)
def __repr__(self):
return "Routine('{}.{}.{}')".format(
self.project, self.dataset_id, self.routine_id
)
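# A minimal sketch of constructing a SQL scalar function with the class above.
# ``proj.dataset.routine_id`` is a hypothetical placeholder; persisting it
# server-side would additionally require something like
# ``Client.create_routine(routine)`` with appropriate credentials.
def _example_build_routine():
    from google.cloud.bigquery.enums import StandardSqlTypeNames

    return Routine(
        "proj.dataset.routine_id",
        type_=RoutineType.SCALAR_FUNCTION,
        language="SQL",
        body="x * 3",
        arguments=[
            RoutineArgument(
                name="x",
                data_type=StandardSqlDataType(type_kind=StandardSqlTypeNames.INT64),
            )
        ],
    )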
class RoutineArgument(object):
"""Input/output argument of a function or a stored procedure.
See:
https://cloud.google.com/bigquery/docs/reference/rest/v2/routines#argument
Args:
``**kwargs`` (Dict):
Initial property values.
"""
_PROPERTY_TO_API_FIELD = {
"data_type": "dataType",
"kind": "argumentKind",
# Even though it's not necessary for field mapping to map when the
# property name equals the resource name, we add these here so that we
# have an exhaustive list of all properties.
"name": "name",
"mode": "mode",
}
def __init__(self, **kwargs) -> None:
self._properties: Dict[str, Any] = {}
for property_name in kwargs:
setattr(self, property_name, kwargs[property_name])
@property
def name(self):
"""Optional[str]: Name of this argument.
Can be absent for function return argument.
"""
return self._properties.get(self._PROPERTY_TO_API_FIELD["name"])
@name.setter
def name(self, value):
self._properties[self._PROPERTY_TO_API_FIELD["name"]] = value
@property
def kind(self):
"""Optional[str]: The kind of argument, for example ``FIXED_TYPE`` or
``ANY_TYPE``.
See:
https://cloud.google.com/bigquery/docs/reference/rest/v2/routines#Argument.FIELDS.argument_kind
"""
return self._properties.get(self._PROPERTY_TO_API_FIELD["kind"])
@kind.setter
def kind(self, value):
self._properties[self._PROPERTY_TO_API_FIELD["kind"]] = value
@property
def mode(self):
"""Optional[str]: The input/output mode of the argument."""
return self._properties.get(self._PROPERTY_TO_API_FIELD["mode"])
@mode.setter
def mode(self, value):
self._properties[self._PROPERTY_TO_API_FIELD["mode"]] = value
@property
def data_type(self):
"""Optional[google.cloud.bigquery.StandardSqlDataType]: Type
of a variable, e.g., a function argument.
See:
https://cloud.google.com/bigquery/docs/reference/rest/v2/routines#Argument.FIELDS.data_type
"""
resource = self._properties.get(self._PROPERTY_TO_API_FIELD["data_type"])
if not resource:
return resource
return StandardSqlDataType.from_api_repr(resource)
@data_type.setter
def data_type(self, value):
if value:
resource = value.to_api_repr()
else:
resource = None
self._properties[self._PROPERTY_TO_API_FIELD["data_type"]] = resource
@classmethod
def from_api_repr(cls, resource: dict) -> "RoutineArgument":
"""Factory: construct a routine argument given its API representation.
Args:
resource (Dict[str, object]): Resource, as returned from the API.
Returns:
google.cloud.bigquery.routine.RoutineArgument:
Python object, as parsed from ``resource``.
"""
ref = cls()
ref._properties = resource
return ref
def to_api_repr(self) -> dict:
"""Construct the API resource representation of this routine argument.
Returns:
Dict[str, object]: Routine argument represented as an API resource.
"""
return self._properties
def __eq__(self, other):
if not isinstance(other, RoutineArgument):
return NotImplemented
return self._properties == other._properties
def __ne__(self, other):
return not self == other
def __repr__(self):
all_properties = [
"{}={}".format(property_name, repr(getattr(self, property_name)))
for property_name in sorted(self._PROPERTY_TO_API_FIELD)
]
return "RoutineArgument({})".format(", ".join(all_properties))
class RoutineReference(object):
"""A pointer to a routine.
See:
https://cloud.google.com/bigquery/docs/reference/rest/v2/routines#routinereference
"""
def __init__(self):
self._properties = {}
@property
def project(self):
"""str: ID of the project containing the routine."""
# TODO: The typehinting for this needs work. Setting this pragma to temporarily
# manage a pytype issue that came up in another PR. See Issue: #2132
return self._properties["projectId"] # pytype: disable=typed-dict-error
@property
def dataset_id(self):
"""str: ID of dataset containing the routine."""
# TODO: The typehinting for this needs work. Setting this pragma to temporarily
# manage a pytype issue that came up in another PR. See Issue: #2132
return self._properties["datasetId"] # pytype: disable=typed-dict-error
@property
def routine_id(self):
"""str: The routine ID."""
# TODO: The typehinting for this needs work. Setting this pragma to temporarily
# manage a pytype issue that came up in another PR. See Issue: #2132
return self._properties["routineId"] # pytype: disable=typed-dict-error
@property
def path(self):
"""str: URL path for the routine's APIs."""
return "/projects/%s/datasets/%s/routines/%s" % (
self.project,
self.dataset_id,
self.routine_id,
)
@classmethod
def from_api_repr(cls, resource: dict) -> "RoutineReference":
"""Factory: construct a routine reference given its API representation.
Args:
resource (Dict[str, object]):
Routine reference representation returned from the API.
Returns:
google.cloud.bigquery.routine.RoutineReference:
Routine reference parsed from ``resource``.
"""
ref = cls()
ref._properties = resource
return ref
@classmethod
def from_string(
cls, routine_id: str, default_project: Optional[str] = None
) -> "RoutineReference":
"""Factory: construct a routine reference from routine ID string.
Args:
routine_id (str):
A routine ID in standard SQL format. If ``default_project``
                is not specified, this must include a project ID, dataset
ID, and routine ID, each separated by ``.``.
default_project (Optional[str]):
The project ID to use when ``routine_id`` does not
include a project ID.
Returns:
google.cloud.bigquery.routine.RoutineReference:
Routine reference parsed from ``routine_id``.
Raises:
ValueError:
If ``routine_id`` is not a fully-qualified routine ID in
standard SQL format.
"""
proj, dset, routine = _helpers._parse_3_part_id(
routine_id, default_project=default_project, property_name="routine_id"
)
return cls.from_api_repr(
{"projectId": proj, "datasetId": dset, "routineId": routine}
)
def to_api_repr(self) -> dict:
"""Construct the API resource representation of this routine reference.
Returns:
Dict[str, object]: Routine reference represented as an API resource.
"""
return self._properties
def __eq__(self, other):
"""Two RoutineReferences are equal if they point to the same routine."""
if not isinstance(other, RoutineReference):
return NotImplemented
return str(self) == str(other)
def __hash__(self):
return hash(str(self))
def __ne__(self, other):
return not self == other
def __repr__(self):
return "RoutineReference.from_string('{}')".format(str(self))
def __str__(self):
"""String representation of the reference.
This is a fully-qualified ID, including the project ID and dataset ID.
"""
return "{}.{}.{}".format(self.project, self.dataset_id, self.routine_id)
class RemoteFunctionOptions(object):
"""Configuration options for controlling remote BigQuery functions."""
_PROPERTY_TO_API_FIELD = {
"endpoint": "endpoint",
"connection": "connection",
"max_batching_rows": "maxBatchingRows",
"user_defined_context": "userDefinedContext",
}
def __init__(
self,
endpoint=None,
connection=None,
max_batching_rows=None,
user_defined_context=None,
_properties=None,
) -> None:
if _properties is None:
_properties = {}
self._properties = _properties
if endpoint is not None:
self.endpoint = endpoint
if connection is not None:
self.connection = connection
if max_batching_rows is not None:
self.max_batching_rows = max_batching_rows
if user_defined_context is not None:
self.user_defined_context = user_defined_context
@property
def connection(self):
"""string: Fully qualified name of the user-provided connection object which holds the authentication information to send requests to the remote service.
Format is "projects/{projectId}/locations/{locationId}/connections/{connectionId}"
"""
return _helpers._str_or_none(self._properties.get("connection"))
@connection.setter
def connection(self, value):
self._properties["connection"] = _helpers._str_or_none(value)
@property
def endpoint(self):
"""string: Endpoint of the user-provided remote service
Example: "https://us-east1-my_gcf_project.cloudfunctions.net/remote_add"
"""
return _helpers._str_or_none(self._properties.get("endpoint"))
@endpoint.setter
def endpoint(self, value):
self._properties["endpoint"] = _helpers._str_or_none(value)
@property
def max_batching_rows(self):
"""int64: Max number of rows in each batch sent to the remote service.
If absent or if 0, BigQuery dynamically decides the number of rows in a batch.
"""
return _helpers._int_or_none(self._properties.get("maxBatchingRows"))
@max_batching_rows.setter
def max_batching_rows(self, value):
self._properties["maxBatchingRows"] = _helpers._str_or_none(value)
@property
def user_defined_context(self):
"""Dict[str, str]: User-defined context as a set of key/value pairs,
which will be sent as function invocation context together with
batched arguments in the requests to the remote service. The total
number of bytes of keys and values must be less than 8KB.
"""
return self._properties.get("userDefinedContext")
@user_defined_context.setter
def user_defined_context(self, value):
if not isinstance(value, dict):
raise ValueError("value must be dictionary")
self._properties["userDefinedContext"] = value
@classmethod
def from_api_repr(cls, resource: dict) -> "RemoteFunctionOptions":
"""Factory: construct remote function options given its API representation.
Args:
resource (Dict[str, object]): Resource, as returned from the API.
Returns:
google.cloud.bigquery.routine.RemoteFunctionOptions:
Python object, as parsed from ``resource``.
"""
ref = cls()
ref._properties = resource
return ref
def to_api_repr(self) -> dict:
"""Construct the API resource representation of this RemoteFunctionOptions.
Returns:
Dict[str, object]: Remote function options represented as an API resource.
"""
return self._properties
def __eq__(self, other):
if not isinstance(other, RemoteFunctionOptions):
return NotImplemented
return self._properties == other._properties
def __ne__(self, other):
return not self == other
def __repr__(self):
all_properties = [
"{}={}".format(property_name, repr(getattr(self, property_name)))
for property_name in sorted(self._PROPERTY_TO_API_FIELD)
]
return "RemoteFunctionOptions({})".format(", ".join(all_properties))

View File

@@ -0,0 +1,896 @@
# Copyright 2015 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Schemas for BigQuery tables / queries."""
from __future__ import annotations
import enum
import typing
from typing import Any, cast, Dict, Iterable, Optional, Union, Sequence
from google.cloud.bigquery import _helpers
from google.cloud.bigquery import standard_sql
from google.cloud.bigquery import enums
from google.cloud.bigquery.enums import StandardSqlTypeNames
_STRUCT_TYPES = ("RECORD", "STRUCT")
# SQL types reference:
# LEGACY SQL: https://cloud.google.com/bigquery/data-types#legacy_sql_data_types
# GoogleSQL: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types
LEGACY_TO_STANDARD_TYPES = {
"STRING": StandardSqlTypeNames.STRING,
"BYTES": StandardSqlTypeNames.BYTES,
"INTEGER": StandardSqlTypeNames.INT64,
"INT64": StandardSqlTypeNames.INT64,
"FLOAT": StandardSqlTypeNames.FLOAT64,
"FLOAT64": StandardSqlTypeNames.FLOAT64,
"NUMERIC": StandardSqlTypeNames.NUMERIC,
"BIGNUMERIC": StandardSqlTypeNames.BIGNUMERIC,
"BOOLEAN": StandardSqlTypeNames.BOOL,
"BOOL": StandardSqlTypeNames.BOOL,
"GEOGRAPHY": StandardSqlTypeNames.GEOGRAPHY,
"RECORD": StandardSqlTypeNames.STRUCT,
"STRUCT": StandardSqlTypeNames.STRUCT,
"TIMESTAMP": StandardSqlTypeNames.TIMESTAMP,
"DATE": StandardSqlTypeNames.DATE,
"TIME": StandardSqlTypeNames.TIME,
"DATETIME": StandardSqlTypeNames.DATETIME,
"FOREIGN": StandardSqlTypeNames.FOREIGN,
# no direct conversion from ARRAY, the latter is represented by mode="REPEATED"
}
"""String names of the legacy SQL types to integer codes of Standard SQL standard_sql."""
class _DefaultSentinel(enum.Enum):
"""Object used as 'sentinel' indicating default value should be used.
Uses enum so that pytype/mypy knows that this is the only possible value.
https://stackoverflow.com/a/60605919/101923
Literal[_DEFAULT_VALUE] is an alternative, but only added in Python 3.8.
https://docs.python.org/3/library/typing.html#typing.Literal
"""
DEFAULT_VALUE = object()
_DEFAULT_VALUE = _DefaultSentinel.DEFAULT_VALUE
class FieldElementType(object):
"""Represents the type of a field element.
Args:
element_type (str): The type of a field element.
"""
def __init__(self, element_type: str):
self._properties = {}
self._properties["type"] = element_type.upper()
@property
def element_type(self):
return self._properties.get("type")
@classmethod
def from_api_repr(cls, api_repr: Optional[dict]) -> Optional["FieldElementType"]:
"""Factory: construct a FieldElementType given its API representation.
Args:
api_repr (Dict[str, str]): field element type as returned from
the API.
Returns:
google.cloud.bigquery.FieldElementType:
Python object, as parsed from ``api_repr``.
"""
if not api_repr:
return None
return cls(api_repr["type"].upper())
def to_api_repr(self) -> dict:
"""Construct the API resource representation of this field element type.
Returns:
Dict[str, str]: Field element type represented as an API resource.
"""
return self._properties
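# A minimal sketch of a RANGE<DATE> column using the element type above; the
# field name is a hypothetical placeholder and ``SchemaField`` is defined below.
def _example_range_field():
    return SchemaField(
        "booking_period",
        "RANGE",
        range_element_type=FieldElementType("DATE"),
    )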
class SchemaField(object):
"""Describe a single field within a table schema.
Args:
name: The name of the field.
field_type:
The type of the field. See
https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#TableFieldSchema.FIELDS.type
mode:
Defaults to ``'NULLABLE'``. The mode of the field. See
https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#TableFieldSchema.FIELDS.mode
description: Description for the field.
fields: Subfields (requires ``field_type`` of 'RECORD').
policy_tags: The policy tag list for the field.
precision:
            Precision (number of digits) of fields with NUMERIC or BIGNUMERIC type.
scale:
Scale (digits after decimal) of fields with NUMERIC or BIGNUMERIC type.
max_length: Maximum length of fields with STRING or BYTES type.
default_value_expression: str, Optional
Used to specify the default value of a field using a SQL expression. It can only be set for
top level fields (columns).
You can use a struct or array expression to specify default value for the entire struct or
array. The valid SQL expressions are:
- Literals for all data types, including STRUCT and ARRAY.
- The following functions:
`CURRENT_TIMESTAMP`
`CURRENT_TIME`
`CURRENT_DATE`
`CURRENT_DATETIME`
`GENERATE_UUID`
`RAND`
`SESSION_USER`
`ST_GEOPOINT`
- Struct or array composed with the above allowed functions, for example:
"[CURRENT_DATE(), DATE '2020-01-01'"]
range_element_type: FieldElementType, str, Optional
The subtype of the RANGE, if the type of this field is RANGE. If
the type is RANGE, this field is required. Possible values for the
field element type of a RANGE include `DATE`, `DATETIME` and
`TIMESTAMP`.
rounding_mode: Union[enums.RoundingMode, str, None]
Specifies the rounding mode to be used when storing values of
NUMERIC and BIGNUMERIC type.
Unspecified will default to using ROUND_HALF_AWAY_FROM_ZERO.
ROUND_HALF_AWAY_FROM_ZERO rounds half values away from zero
when applying precision and scale upon writing of NUMERIC and BIGNUMERIC
values.
For Scale: 0
1.1, 1.2, 1.3, 1.4 => 1
1.5, 1.6, 1.7, 1.8, 1.9 => 2
ROUND_HALF_EVEN rounds half values to the nearest even value
when applying precision and scale upon writing of NUMERIC and BIGNUMERIC
values.
For Scale: 0
1.1, 1.2, 1.3, 1.4 => 1
1.5 => 2
1.6, 1.7, 1.8, 1.9 => 2
2.5 => 2
foreign_type_definition: Optional[str]
Definition of the foreign data type.
Only valid for top-level schema fields (not nested fields).
If the type is FOREIGN, this field is required.
"""
def __init__(
self,
name: str,
field_type: str,
mode: str = "NULLABLE",
default_value_expression: Optional[str] = None,
description: Union[str, _DefaultSentinel] = _DEFAULT_VALUE,
fields: Iterable["SchemaField"] = (),
policy_tags: Union["PolicyTagList", None, _DefaultSentinel] = _DEFAULT_VALUE,
precision: Union[int, _DefaultSentinel] = _DEFAULT_VALUE,
scale: Union[int, _DefaultSentinel] = _DEFAULT_VALUE,
max_length: Union[int, _DefaultSentinel] = _DEFAULT_VALUE,
range_element_type: Union[FieldElementType, str, None] = None,
rounding_mode: Union[enums.RoundingMode, str, None] = None,
foreign_type_definition: Optional[str] = None,
):
self._properties: Dict[str, Any] = {
"name": name,
"type": field_type,
}
self._properties["name"] = name
if mode is not None:
self._properties["mode"] = mode.upper()
if description is not _DEFAULT_VALUE:
self._properties["description"] = description
if default_value_expression is not None:
self._properties["defaultValueExpression"] = default_value_expression
if precision is not _DEFAULT_VALUE:
self._properties["precision"] = precision
if scale is not _DEFAULT_VALUE:
self._properties["scale"] = scale
if max_length is not _DEFAULT_VALUE:
self._properties["maxLength"] = max_length
if policy_tags is not _DEFAULT_VALUE:
# TODO: The typehinting for this needs work. Setting this pragma to temporarily
# manage a pytype issue that came up in another PR. See Issue: #2132
self._properties["policyTags"] = (
policy_tags.to_api_repr() # pytype: disable=attribute-error
if policy_tags is not None
else None
)
if isinstance(range_element_type, str):
self._properties["rangeElementType"] = {"type": range_element_type}
if isinstance(range_element_type, FieldElementType):
self._properties["rangeElementType"] = range_element_type.to_api_repr()
if rounding_mode is not None:
self._properties["roundingMode"] = rounding_mode
if foreign_type_definition is not None:
self._properties["foreignTypeDefinition"] = foreign_type_definition
if fields: # Don't set the property if it's not set.
self._properties["fields"] = [field.to_api_repr() for field in fields]
@classmethod
def from_api_repr(cls, api_repr: dict) -> "SchemaField":
"""Return a ``SchemaField`` object deserialized from a dictionary.
Args:
api_repr (Mapping[str, str]): The serialized representation
of the SchemaField, such as what is output by
:meth:`to_api_repr`.
Returns:
google.cloud.bigquery.schema.SchemaField: The ``SchemaField`` object.
"""
placeholder = cls("this_will_be_replaced", "PLACEHOLDER")
# Note: we don't make a copy of api_repr because this can cause
# unnecessary slowdowns, especially on deeply nested STRUCT / RECORD
# fields. See https://github.com/googleapis/python-bigquery/issues/6
placeholder._properties = api_repr
# Add the field `mode` with default value if it does not exist. Fixes
# an incompatibility issue with pandas-gbq:
# https://github.com/googleapis/python-bigquery-pandas/issues/854
if "mode" not in placeholder._properties:
placeholder._properties["mode"] = "NULLABLE"
return placeholder
@property
def name(self):
"""str: The name of the field."""
return self._properties.get("name", "")
@property
def field_type(self):
"""str: The type of the field.
See:
https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#TableFieldSchema.FIELDS.type
"""
type_ = self._properties.get("type")
if type_ is None: # Shouldn't happen, but some unit tests do this.
return None
return cast(str, type_).upper()
@property
def mode(self):
"""Optional[str]: The mode of the field.
See:
https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#TableFieldSchema.FIELDS.mode
"""
return cast(str, self._properties.get("mode", "NULLABLE")).upper()
@property
def is_nullable(self):
"""bool: whether 'mode' is 'nullable'."""
return self.mode == "NULLABLE"
@property
def default_value_expression(self):
"""Optional[str] default value of a field, using an SQL expression"""
return self._properties.get("defaultValueExpression")
@property
def description(self):
"""Optional[str]: description for the field."""
return self._properties.get("description")
@property
def precision(self):
"""Optional[int]: Precision (number of digits) for the NUMERIC field."""
return _helpers._int_or_none(self._properties.get("precision"))
@property
def scale(self):
"""Optional[int]: Scale (digits after decimal) for the NUMERIC field."""
return _helpers._int_or_none(self._properties.get("scale"))
@property
def max_length(self):
"""Optional[int]: Maximum length for the STRING or BYTES field."""
return _helpers._int_or_none(self._properties.get("maxLength"))
@property
def range_element_type(self):
"""Optional[FieldElementType]: The subtype of the RANGE, if the
type of this field is RANGE.
Must be set when ``type`` is `"RANGE"`. Must be one of `"DATE"`,
`"DATETIME"` or `"TIMESTAMP"`.
"""
if self._properties.get("rangeElementType"):
ret = self._properties.get("rangeElementType")
return FieldElementType.from_api_repr(ret)
@property
def rounding_mode(self):
"""Enum that specifies the rounding mode to be used when storing values of
NUMERIC and BIGNUMERIC type.
"""
return self._properties.get("roundingMode")
@property
def foreign_type_definition(self):
"""Definition of the foreign data type.
Only valid for top-level schema fields (not nested fields).
If the type is FOREIGN, this field is required.
"""
return self._properties.get("foreignTypeDefinition")
@property
def fields(self):
"""Optional[tuple]: Subfields contained in this field.
        Must be empty or unset if ``field_type`` is not 'RECORD'.
"""
return tuple(_to_schema_fields(self._properties.get("fields", [])))
@property
def policy_tags(self):
"""Optional[google.cloud.bigquery.schema.PolicyTagList]: Policy tag list
definition for this field.
"""
resource = self._properties.get("policyTags")
return PolicyTagList.from_api_repr(resource) if resource is not None else None
def to_api_repr(self) -> dict:
"""Return a dictionary representing this schema field.
Returns:
Dict: A dictionary representing the SchemaField in a serialized form.
"""
# Note: we don't make a copy of _properties because this can cause
# unnecessary slowdowns, especially on deeply nested STRUCT / RECORD
# fields. See https://github.com/googleapis/python-bigquery/issues/6
return self._properties
def _key(self):
"""A tuple key that uniquely describes this field.
Used to compute this instance's hashcode and evaluate equality.
Returns:
Tuple: The contents of this :class:`~google.cloud.bigquery.schema.SchemaField`.
"""
field_type = self.field_type.upper() if self.field_type is not None else None
# Type can temporarily be set to None if the code needs a SchemaField instance,
# but has not determined the exact type of the field yet.
if field_type is not None:
if field_type == "STRING" or field_type == "BYTES":
if self.max_length is not None:
field_type = f"{field_type}({self.max_length})"
elif field_type.endswith("NUMERIC"):
if self.precision is not None:
if self.scale is not None:
field_type = f"{field_type}({self.precision}, {self.scale})"
else:
field_type = f"{field_type}({self.precision})"
policy_tags = (
None if self.policy_tags is None else tuple(sorted(self.policy_tags.names))
)
return (
self.name,
field_type,
# Mode is always str, if not given it defaults to a str value
self.mode.upper(), # pytype: disable=attribute-error
self.default_value_expression,
self.description,
self.fields,
policy_tags,
)
def to_standard_sql(self) -> standard_sql.StandardSqlField:
"""Return the field as the standard SQL field representation object."""
sql_type = standard_sql.StandardSqlDataType()
if self.mode == "REPEATED":
sql_type.type_kind = StandardSqlTypeNames.ARRAY
else:
sql_type.type_kind = LEGACY_TO_STANDARD_TYPES.get(
self.field_type,
StandardSqlTypeNames.TYPE_KIND_UNSPECIFIED,
)
if sql_type.type_kind == StandardSqlTypeNames.ARRAY: # noqa: E721
array_element_type = LEGACY_TO_STANDARD_TYPES.get(
self.field_type,
StandardSqlTypeNames.TYPE_KIND_UNSPECIFIED,
)
sql_type.array_element_type = standard_sql.StandardSqlDataType(
type_kind=array_element_type
)
# ARRAY cannot directly contain other arrays, only scalar types and STRUCTs
# https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#array-type
if array_element_type == StandardSqlTypeNames.STRUCT: # noqa: E721
sql_type.array_element_type.struct_type = (
standard_sql.StandardSqlStructType(
fields=(field.to_standard_sql() for field in self.fields)
)
)
elif sql_type.type_kind == StandardSqlTypeNames.STRUCT: # noqa: E721
sql_type.struct_type = standard_sql.StandardSqlStructType(
fields=(field.to_standard_sql() for field in self.fields)
)
return standard_sql.StandardSqlField(name=self.name, type=sql_type)
def __eq__(self, other):
if not isinstance(other, SchemaField):
return NotImplemented
return self._key() == other._key()
def __ne__(self, other):
return not self == other
def __hash__(self):
return hash(self._key())
def __repr__(self):
key = self._key()
policy_tags = key[-1]
policy_tags_inst = None if policy_tags is None else PolicyTagList(policy_tags)
adjusted_key = key[:-1] + (policy_tags_inst,)
return f"{self.__class__.__name__}{adjusted_key}"
def _parse_schema_resource(info):
"""Parse a resource fragment into a schema field.
Args:
        info (Union[Sequence[Dict], Mapping[str, Dict]]): a list of field
            resources, or a mapping containing a "fields" key to parse.
    Returns:
        Sequence[google.cloud.bigquery.schema.SchemaField]:
            The parsed fields; an empty list if no "fields" key is present.
"""
if isinstance(info, list):
return [SchemaField.from_api_repr(f) for f in info]
return [SchemaField.from_api_repr(f) for f in info.get("fields", ())]
def _build_schema_resource(fields):
"""Generate a resource fragment for a schema.
Args:
        fields (Sequence[google.cloud.bigquery.schema.SchemaField]): schema to be dumped.
Returns:
Sequence[Dict]: Mappings describing the schema of the supplied fields.
"""
if isinstance(fields, Sequence):
# Input is a Sequence (e.g. a list): Process and return a list of SchemaFields
return [field.to_api_repr() for field in fields]
else:
raise TypeError("Schema must be a Sequence (e.g. a list) or None.")
def _to_schema_fields(schema):
"""Coerces schema to a list of SchemaField instances while
preserving the original structure as much as possible.
Args:
        schema (Sequence[Union[ \
            :class:`~google.cloud.bigquery.schema.SchemaField`, \
            Mapping[str, Any] \
        ]]):
            Table schema to convert. Can be a list of SchemaField
            objects or mappings.
Returns:
A list of SchemaField objects.
Raises:
TypeError: If schema is not a Sequence.
"""
if isinstance(schema, Sequence):
# Input is a Sequence (e.g. a list): Process and return a list of SchemaFields
return [
field
if isinstance(field, SchemaField)
else SchemaField.from_api_repr(field)
for field in schema
]
else:
raise TypeError("Schema must be a Sequence (e.g. a list) or None.")
class PolicyTagList(object):
"""Define Policy Tags for a column.
Args:
        names (Optional[Tuple[str]]):
            List of policy tags to associate with the column. Policy tag
            identifiers are of the form
            `projects/*/locations/*/taxonomies/*/policyTags/*`.
"""
def __init__(self, names: Iterable[str] = ()):
self._properties = {}
self._properties["names"] = tuple(names)
@property
def names(self):
"""Tuple[str]: Policy tags associated with this definition."""
return self._properties.get("names", ())
def _key(self):
"""A tuple key that uniquely describes this PolicyTagList.
Used to compute this instance's hashcode and evaluate equality.
Returns:
Tuple: The contents of this :class:`~google.cloud.bigquery.schema.PolicyTagList`.
"""
return tuple(sorted(self._properties.get("names", ())))
def __eq__(self, other):
if not isinstance(other, PolicyTagList):
return NotImplemented
return self._key() == other._key()
def __ne__(self, other):
return not self == other
def __hash__(self):
return hash(self._key())
def __repr__(self):
return f"{self.__class__.__name__}(names={self._key()})"
@classmethod
def from_api_repr(cls, api_repr: dict) -> "PolicyTagList":
"""Return a :class:`PolicyTagList` object deserialized from a dict.
This method creates a new ``PolicyTagList`` instance that points to
the ``api_repr`` parameter as its internal properties dict. This means
that when a ``PolicyTagList`` instance is stored as a property of
another object, any changes made at the higher level will also appear
here.
Args:
api_repr (Mapping[str, str]):
The serialized representation of the PolicyTagList, such as
what is output by :meth:`to_api_repr`.
Returns:
Optional[google.cloud.bigquery.schema.PolicyTagList]:
The ``PolicyTagList`` object or None.
"""
if api_repr is None:
return None
names = api_repr.get("names", ())
return cls(names=names)
def to_api_repr(self) -> dict:
"""Return a dictionary representing this object.
This method returns the properties dict of the ``PolicyTagList``
instance rather than making a copy. This means that when a
``PolicyTagList`` instance is stored as a property of another
object, any changes made at the higher level will also appear here.
Returns:
dict:
A dictionary representing the PolicyTagList object in
serialized form.
"""
answer = {"names": list(self.names)}
return answer
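# A minimal sketch of attaching a policy tag to a column; the taxonomy path is a
# hypothetical placeholder.
def _example_policy_tagged_field():
    tags = PolicyTagList(
        names=("projects/my-project/locations/us/taxonomies/123/policyTags/456",)
    )
    return SchemaField("ssn", "STRING", mode="REQUIRED", policy_tags=tags)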
class ForeignTypeInfo:
"""Metadata about the foreign data type definition such as the system in which the
type is defined.
Args:
type_system (str): Required. Specifies the system which defines the
foreign data type.
TypeSystem enum currently includes:
* "TYPE_SYSTEM_UNSPECIFIED"
* "HIVE"
"""
def __init__(self, type_system: Optional[str] = None):
self._properties: Dict[str, Any] = {}
self.type_system = type_system
@property
def type_system(self) -> Optional[str]:
"""Required. Specifies the system which defines the foreign data
type."""
return self._properties.get("typeSystem")
@type_system.setter
def type_system(self, value: Optional[str]):
value = _helpers._isinstance_or_raise(value, str, none_allowed=True)
self._properties["typeSystem"] = value
def to_api_repr(self) -> dict:
"""Build an API representation of this object.
Returns:
Dict[str, Any]:
A dictionary in the format used by the BigQuery API.
"""
return self._properties
@classmethod
def from_api_repr(cls, api_repr: Dict[str, Any]) -> "ForeignTypeInfo":
"""Factory: constructs an instance of the class (cls)
given its API representation.
Args:
api_repr (Dict[str, Any]):
API representation of the object to be instantiated.
Returns:
An instance of the class initialized with data from 'api_repr'.
"""
config = cls()
config._properties = api_repr
return config
class SerDeInfo:
"""Serializer and deserializer information.
Args:
serialization_library (str): Required. Specifies a fully-qualified class
name of the serialization library that is responsible for the
translation of data between table representation and the underlying
low-level input and output format structures. The maximum length is
256 characters.
name (Optional[str]): Name of the SerDe. The maximum length is 256
characters.
        parameters (Optional[dict[str, str]]): Key-value pairs that define the
            initialization parameters for the serialization library. Maximum
            size 10 KiB.
"""
def __init__(
self,
serialization_library: str,
name: Optional[str] = None,
parameters: Optional[dict[str, str]] = None,
):
self._properties: Dict[str, Any] = {}
self.serialization_library = serialization_library
self.name = name
self.parameters = parameters
@property
def serialization_library(self) -> str:
"""Required. Specifies a fully-qualified class name of the serialization
library that is responsible for the translation of data between table
representation and the underlying low-level input and output format
structures. The maximum length is 256 characters."""
return typing.cast(str, self._properties.get("serializationLibrary"))
@serialization_library.setter
def serialization_library(self, value: str):
value = _helpers._isinstance_or_raise(value, str, none_allowed=False)
self._properties["serializationLibrary"] = value
@property
def name(self) -> Optional[str]:
"""Optional. Name of the SerDe. The maximum length is 256 characters."""
return self._properties.get("name")
@name.setter
def name(self, value: Optional[str] = None):
value = _helpers._isinstance_or_raise(value, str, none_allowed=True)
self._properties["name"] = value
@property
def parameters(self) -> Optional[dict[str, str]]:
"""Optional. Key-value pairs that define the initialization parameters
        for the serialization library. Maximum size 10 KiB."""
return self._properties.get("parameters")
@parameters.setter
def parameters(self, value: Optional[dict[str, str]] = None):
value = _helpers._isinstance_or_raise(value, dict, none_allowed=True)
self._properties["parameters"] = value
def to_api_repr(self) -> dict:
"""Build an API representation of this object.
Returns:
Dict[str, Any]:
A dictionary in the format used by the BigQuery API.
"""
return self._properties
@classmethod
def from_api_repr(cls, api_repr: dict) -> SerDeInfo:
"""Factory: constructs an instance of the class (cls)
given its API representation.
Args:
api_repr (Dict[str, Any]):
API representation of the object to be instantiated.
Returns:
An instance of the class initialized with data from 'api_repr'.
"""
config = cls("PLACEHOLDER")
config._properties = api_repr
return config
class StorageDescriptor:
"""Contains information about how a table's data is stored and accessed by open
source query engines.
Args:
input_format (Optional[str]): Specifies the fully qualified class name of
the InputFormat (e.g.
"org.apache.hadoop.hive.ql.io.orc.OrcInputFormat"). The maximum
length is 128 characters.
location_uri (Optional[str]): The physical location of the table (e.g.
'gs://spark-dataproc-data/pangea-data/case_sensitive/' or
'gs://spark-dataproc-data/pangea-data/'). The maximum length is
2056 bytes.
output_format (Optional[str]): Specifies the fully qualified class name
of the OutputFormat (e.g.
"org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat"). The maximum
length is 128 characters.
serde_info (Union[SerDeInfo, dict, None]): Serializer and deserializer information.
"""
def __init__(
self,
input_format: Optional[str] = None,
location_uri: Optional[str] = None,
output_format: Optional[str] = None,
serde_info: Union[SerDeInfo, dict, None] = None,
):
self._properties: Dict[str, Any] = {}
self.input_format = input_format
self.location_uri = location_uri
self.output_format = output_format
        # Using typing.cast() because mypy cannot wrap its head around the fact that:
# the setter can accept Union[SerDeInfo, dict, None]
# but the getter will only ever return Optional[SerDeInfo].
self.serde_info = typing.cast(Optional[SerDeInfo], serde_info)
@property
def input_format(self) -> Optional[str]:
"""Optional. Specifies the fully qualified class name of the InputFormat
(e.g. "org.apache.hadoop.hive.ql.io.orc.OrcInputFormat"). The maximum
length is 128 characters."""
return self._properties.get("inputFormat")
@input_format.setter
def input_format(self, value: Optional[str]):
value = _helpers._isinstance_or_raise(value, str, none_allowed=True)
self._properties["inputFormat"] = value
@property
def location_uri(self) -> Optional[str]:
"""Optional. The physical location of the table (e.g. 'gs://spark-
dataproc-data/pangea-data/case_sensitive/' or 'gs://spark-dataproc-
data/pangea-data/'). The maximum length is 2056 bytes."""
return self._properties.get("locationUri")
@location_uri.setter
def location_uri(self, value: Optional[str]):
value = _helpers._isinstance_or_raise(value, str, none_allowed=True)
self._properties["locationUri"] = value
@property
def output_format(self) -> Optional[str]:
"""Optional. Specifies the fully qualified class name of the
OutputFormat (e.g. "org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat").
The maximum length is 128 characters."""
return self._properties.get("outputFormat")
@output_format.setter
def output_format(self, value: Optional[str]):
value = _helpers._isinstance_or_raise(value, str, none_allowed=True)
self._properties["outputFormat"] = value
@property
def serde_info(self) -> Optional[SerDeInfo]:
"""Optional. Serializer and deserializer information."""
prop = _helpers._get_sub_prop(self._properties, ["serDeInfo"])
if prop is not None:
return typing.cast(SerDeInfo, SerDeInfo.from_api_repr(prop))
return None
@serde_info.setter
def serde_info(self, value: Union[SerDeInfo, dict, None]):
value = _helpers._isinstance_or_raise(
value, (SerDeInfo, dict), none_allowed=True
)
if isinstance(value, SerDeInfo):
self._properties["serDeInfo"] = value.to_api_repr()
else:
self._properties["serDeInfo"] = value
def to_api_repr(self) -> dict:
"""Build an API representation of this object.
Returns:
Dict[str, Any]:
A dictionary in the format used by the BigQuery API.
"""
return self._properties
@classmethod
def from_api_repr(cls, resource: dict) -> StorageDescriptor:
"""Factory: constructs an instance of the class (cls)
given its API representation.
Args:
resource (Dict[str, Any]):
API representation of the object to be instantiated.
Returns:
An instance of the class initialized with data from 'resource'.
"""
config = cls()
config._properties = resource
return config
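# A minimal sketch of describing externally stored ORC data with the classes
# above; the Hive class names and bucket path are hypothetical placeholders.
def _example_storage_descriptor():
    serde = SerDeInfo(
        serialization_library="org.apache.hadoop.hive.ql.io.orc.OrcSerde",
        parameters={"field.delim": ","},
    )
    return StorageDescriptor(
        input_format="org.apache.hadoop.hive.ql.io.orc.OrcInputFormat",
        output_format="org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat",
        location_uri="gs://my-bucket/hive-data/",
        serde_info=serde,
    )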

View File

@@ -0,0 +1,389 @@
# Copyright 2021 Google LLC
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# https://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import typing
from typing import Any, Dict, Iterable, List, Optional
from google.cloud.bigquery.enums import StandardSqlTypeNames
class StandardSqlDataType:
"""The type of a variable, e.g., a function argument.
See:
https://cloud.google.com/bigquery/docs/reference/rest/v2/StandardSqlDataType
Examples:
.. code-block:: text
INT64: {type_kind="INT64"}
            ARRAY<STRING>: {type_kind="ARRAY", array_element_type="STRING"}
            STRUCT<x STRING, y ARRAY<DATE>>: {
type_kind="STRUCT",
struct_type={
fields=[
{name="x", type={type_kind="STRING"}},
{
name="y",
type={type_kind="ARRAY", array_element_type="DATE"}
}
]
}
}
            RANGE<DATETIME>: {type_kind="RANGE", range_element_type="DATETIME"}
Args:
type_kind:
The top level type of this field. Can be any standard SQL data type,
e.g. INT64, DATE, ARRAY.
array_element_type:
The type of the array's elements, if type_kind is ARRAY.
struct_type:
The fields of this struct, in order, if type_kind is STRUCT.
range_element_type:
The type of the range's elements, if type_kind is RANGE.
"""
def __init__(
self,
type_kind: Optional[
StandardSqlTypeNames
] = StandardSqlTypeNames.TYPE_KIND_UNSPECIFIED,
array_element_type: Optional["StandardSqlDataType"] = None,
struct_type: Optional["StandardSqlStructType"] = None,
range_element_type: Optional["StandardSqlDataType"] = None,
):
self._properties: Dict[str, Any] = {}
self.type_kind = type_kind
self.array_element_type = array_element_type
self.struct_type = struct_type
self.range_element_type = range_element_type
@property
def type_kind(self) -> Optional[StandardSqlTypeNames]:
"""The top level type of this field.
Can be any standard SQL data type, e.g. INT64, DATE, ARRAY.
"""
kind = self._properties["typeKind"]
return StandardSqlTypeNames[kind] # pytype: disable=missing-parameter
@type_kind.setter
def type_kind(self, value: Optional[StandardSqlTypeNames]):
if not value:
kind = StandardSqlTypeNames.TYPE_KIND_UNSPECIFIED.value
else:
kind = value.value
self._properties["typeKind"] = kind
@property
def array_element_type(self) -> Optional["StandardSqlDataType"]:
"""The type of the array's elements, if type_kind is ARRAY."""
element_type = self._properties.get("arrayElementType")
if element_type is None:
return None
result = StandardSqlDataType()
result._properties = element_type # We do not use a copy on purpose.
return result
@array_element_type.setter
def array_element_type(self, value: Optional["StandardSqlDataType"]):
element_type = None if value is None else value.to_api_repr()
if element_type is None:
self._properties.pop("arrayElementType", None)
else:
self._properties["arrayElementType"] = element_type
@property
def struct_type(self) -> Optional["StandardSqlStructType"]:
"""The fields of this struct, in order, if type_kind is STRUCT."""
struct_info = self._properties.get("structType")
if struct_info is None:
return None
result = StandardSqlStructType()
result._properties = struct_info # We do not use a copy on purpose.
return result
@struct_type.setter
def struct_type(self, value: Optional["StandardSqlStructType"]):
struct_type = None if value is None else value.to_api_repr()
if struct_type is None:
self._properties.pop("structType", None)
else:
self._properties["structType"] = struct_type
@property
def range_element_type(self) -> Optional["StandardSqlDataType"]:
"""The type of the range's elements, if type_kind = "RANGE". Must be
one of DATETIME, DATE, or TIMESTAMP."""
range_element_info = self._properties.get("rangeElementType")
if range_element_info is None:
return None
result = StandardSqlDataType()
result._properties = range_element_info # We do not use a copy on purpose.
return result
@range_element_type.setter
def range_element_type(self, value: Optional["StandardSqlDataType"]):
range_element_type = None if value is None else value.to_api_repr()
if range_element_type is None:
self._properties.pop("rangeElementType", None)
else:
self._properties["rangeElementType"] = range_element_type
def to_api_repr(self) -> Dict[str, Any]:
"""Construct the API resource representation of this SQL data type."""
return copy.deepcopy(self._properties)
@classmethod
def from_api_repr(cls, resource: Dict[str, Any]):
"""Construct an SQL data type instance given its API representation."""
type_kind = resource.get("typeKind")
if type_kind not in StandardSqlTypeNames.__members__:
type_kind = StandardSqlTypeNames.TYPE_KIND_UNSPECIFIED
else:
# Convert string to an enum member.
type_kind = StandardSqlTypeNames[ # pytype: disable=missing-parameter
typing.cast(str, type_kind)
]
array_element_type = None
if type_kind == StandardSqlTypeNames.ARRAY:
element_type = resource.get("arrayElementType")
if element_type:
array_element_type = cls.from_api_repr(element_type)
struct_type = None
if type_kind == StandardSqlTypeNames.STRUCT:
struct_info = resource.get("structType")
if struct_info:
struct_type = StandardSqlStructType.from_api_repr(struct_info)
range_element_type = None
if type_kind == StandardSqlTypeNames.RANGE:
range_element_info = resource.get("rangeElementType")
if range_element_info:
range_element_type = cls.from_api_repr(range_element_info)
return cls(type_kind, array_element_type, struct_type, range_element_type)
def __eq__(self, other):
if not isinstance(other, StandardSqlDataType):
return NotImplemented
else:
return (
self.type_kind == other.type_kind
and self.array_element_type == other.array_element_type
and self.struct_type == other.struct_type
and self.range_element_type == other.range_element_type
)
def __str__(self):
result = f"{self.__class__.__name__}(type_kind={self.type_kind!r}, ...)"
return result
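# A minimal usage sketch (illustrative only, not part of the module's public
# surface); it assumes the module-level imports of StandardSqlTypeNames and
# copy/typing earlier in this file. It builds an ARRAY<INT64> type and
# round-trips it through its REST representation. The helper name
# `_example_standard_sql_data_type` is hypothetical.
def _example_standard_sql_data_type():
    array_of_int64 = StandardSqlDataType(
        type_kind=StandardSqlTypeNames.ARRAY,
        array_element_type=StandardSqlDataType(StandardSqlTypeNames.INT64),
    )
    resource = array_of_int64.to_api_repr()
    # resource == {"typeKind": "ARRAY", "arrayElementType": {"typeKind": "INT64"}}
    assert StandardSqlDataType.from_api_repr(resource) == array_of_int64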
class StandardSqlField:
"""A field or a column.
See:
https://cloud.google.com/bigquery/docs/reference/rest/v2/StandardSqlField
Args:
name:
The name of this field. Can be absent for struct fields.
type:
The type of this parameter. Absent if not explicitly specified.
For example, a CREATE FUNCTION statement can omit the return type; in that
case the output parameter does not have this "type" field.
"""
def __init__(
self, name: Optional[str] = None, type: Optional[StandardSqlDataType] = None
):
type_repr = None if type is None else type.to_api_repr()
self._properties = {"name": name, "type": type_repr}
@property
def name(self) -> Optional[str]:
"""The name of this field. Can be absent for struct fields."""
return typing.cast(Optional[str], self._properties["name"])
@name.setter
def name(self, value: Optional[str]):
self._properties["name"] = value
@property
def type(self) -> Optional[StandardSqlDataType]:
"""The type of this parameter. Absent if not explicitly specified.
For example, a CREATE FUNCTION statement can omit the return type; in that
case the output parameter does not have this "type" field.
"""
type_info = self._properties["type"]
if type_info is None:
return None
result = StandardSqlDataType()
# We do not use a properties copy on purpose.
result._properties = typing.cast(Dict[str, Any], type_info)
return result
@type.setter
def type(self, value: Optional[StandardSqlDataType]):
value_repr = None if value is None else value.to_api_repr()
self._properties["type"] = value_repr
def to_api_repr(self) -> Dict[str, Any]:
"""Construct the API resource representation of this SQL field."""
return copy.deepcopy(self._properties)
@classmethod
def from_api_repr(cls, resource: Dict[str, Any]):
"""Construct an SQL field instance given its API representation."""
result = cls(
name=resource.get("name"),
type=StandardSqlDataType.from_api_repr(resource.get("type", {})),
)
return result
def __eq__(self, other):
if not isinstance(other, StandardSqlField):
return NotImplemented
else:
return self.name == other.name and self.type == other.type
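# A minimal sketch (hypothetical helper name, not part of the module) showing
# how a field pairs a name with a StandardSqlDataType and survives a
# to_api_repr()/from_api_repr() round trip.
def _example_standard_sql_field():
    field = StandardSqlField(
        name="x", type=StandardSqlDataType(StandardSqlTypeNames.INT64)
    )
    resource = field.to_api_repr()
    # resource == {"name": "x", "type": {"typeKind": "INT64"}}
    assert StandardSqlField.from_api_repr(resource) == field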
class StandardSqlStructType:
"""Type of a struct field.
See:
https://cloud.google.com/bigquery/docs/reference/rest/v2/StandardSqlDataType#StandardSqlStructType
Args:
fields: The fields in this struct.
"""
def __init__(self, fields: Optional[Iterable[StandardSqlField]] = None):
if fields is None:
fields = []
self._properties = {"fields": [field.to_api_repr() for field in fields]}
@property
def fields(self) -> List[StandardSqlField]:
"""The fields in this struct."""
result = []
for field_resource in self._properties.get("fields", []):
field = StandardSqlField()
field._properties = field_resource # We do not use a copy on purpose.
result.append(field)
return result
@fields.setter
def fields(self, value: Iterable[StandardSqlField]):
self._properties["fields"] = [field.to_api_repr() for field in value]
def to_api_repr(self) -> Dict[str, Any]:
"""Construct the API resource representation of this SQL struct type."""
return copy.deepcopy(self._properties)
@classmethod
def from_api_repr(cls, resource: Dict[str, Any]) -> "StandardSqlStructType":
"""Construct an SQL struct type instance given its API representation."""
fields = (
StandardSqlField.from_api_repr(field_resource)
for field_resource in resource.get("fields", [])
)
return cls(fields=fields)
def __eq__(self, other):
if not isinstance(other, StandardSqlStructType):
return NotImplemented
else:
return self.fields == other.fields
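# A minimal sketch (hypothetical helper name, not part of the module): the SQL
# type STRUCT<x INT64, y STRING> expressed with StandardSqlStructType. The
# `fields` property reads back views over the stored resource, so the
# serialization round trip compares equal.
def _example_standard_sql_struct_type():
    struct_type = StandardSqlStructType(
        fields=[
            StandardSqlField("x", StandardSqlDataType(StandardSqlTypeNames.INT64)),
            StandardSqlField("y", StandardSqlDataType(StandardSqlTypeNames.STRING)),
        ]
    )
    assert [field.name for field in struct_type.fields] == ["x", "y"]
    restored = StandardSqlStructType.from_api_repr(struct_type.to_api_repr())
    assert restored == struct_type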
class StandardSqlTableType:
"""A table type.
See:
https://cloud.google.com/workflows/docs/reference/googleapis/bigquery/v2/Overview#StandardSqlTableType
Args:
columns: The columns in this table type.
"""
def __init__(self, columns: Iterable[StandardSqlField]):
self._properties = {"columns": [col.to_api_repr() for col in columns]}
@property
def columns(self) -> List[StandardSqlField]:
"""The columns in this table type."""
result = []
for column_resource in self._properties.get("columns", []):
column = StandardSqlField()
column._properties = column_resource # We do not use a copy on purpose.
result.append(column)
return result
@columns.setter
def columns(self, value: Iterable[StandardSqlField]):
self._properties["columns"] = [col.to_api_repr() for col in value]
def to_api_repr(self) -> Dict[str, Any]:
"""Construct the API resource representation of this SQL table type."""
return copy.deepcopy(self._properties)
@classmethod
def from_api_repr(cls, resource: Dict[str, Any]) -> "StandardSqlTableType":
"""Construct an SQL table type instance given its API representation."""
columns = []
for column_resource in resource.get("columns", []):
type_ = column_resource.get("type")
if type_ is None:
type_ = {}
column = StandardSqlField(
name=column_resource.get("name"),
type=StandardSqlDataType.from_api_repr(type_),
)
columns.append(column)
return cls(columns=columns)
def __eq__(self, other):
if not isinstance(other, StandardSqlTableType):
return NotImplemented
else:
return self.columns == other.columns
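# A minimal sketch (hypothetical helper name, not part of the module): a table
# type with two columns, of the kind used to describe a table-valued result,
# round-tripped through its API representation.
def _example_standard_sql_table_type():
    table_type = StandardSqlTableType(
        columns=[
            StandardSqlField("name", StandardSqlDataType(StandardSqlTypeNames.STRING)),
            StandardSqlField("age", StandardSqlDataType(StandardSqlTypeNames.INT64)),
        ]
    )
    resource = table_type.to_api_repr()
    # resource == {"columns": [{"name": "name", "type": {"typeKind": "STRING"}}, ...]}
    assert StandardSqlTableType.from_api_repr(resource) == table_type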

File diff suppressed because it is too large

View File

@@ -0,0 +1,15 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__version__ = "3.31.0"