# File: evo-ai/.venv/lib/python3.10/site-packages/google/cloud/bigquery/magics/magics.py
# Copyright 2018 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""IPython Magics
Install ``bigquery-magics`` and call ``%load_ext bigquery_magics`` to use the
``%%bigquery`` cell magic.
See the `BigQuery Magics reference documentation
<https://googleapis.dev/python/bigquery-magics/latest/>`_.
"""
from __future__ import print_function

import re
import ast
import copy
import functools
import sys
import time
import warnings
from concurrent import futures

try:
    import IPython  # type: ignore
    from IPython import display  # type: ignore
    from IPython.core import magic_arguments  # type: ignore
except ImportError:
    raise ImportError("This module can only be loaded in IPython.")

from google.api_core import client_info
from google.api_core import client_options
from google.api_core.exceptions import NotFound
import google.auth  # type: ignore
from google.cloud import bigquery
import google.cloud.bigquery.dataset
from google.cloud.bigquery import _versions_helpers
from google.cloud.bigquery import exceptions
from google.cloud.bigquery.dbapi import _helpers
from google.cloud.bigquery.magics import line_arg_parser as lap

try:
    import bigquery_magics  # type: ignore
except ImportError:
    bigquery_magics = None

IPYTHON_USER_AGENT = "ipython-{}".format(IPython.__version__)  # type: ignore


class Context(object):
    """Storage for objects to be used throughout an IPython notebook session.

    A Context object is initialized when the ``magics`` module is imported,
    and can be found at ``google.cloud.bigquery.magics.context``.
    """

    def __init__(self):
        self._credentials = None
        self._project = None
        self._connection = None
        self._default_query_job_config = bigquery.QueryJobConfig()
        self._bigquery_client_options = client_options.ClientOptions()
        self._bqstorage_client_options = client_options.ClientOptions()
        self._progress_bar_type = "tqdm_notebook"

    @property
    def credentials(self):
        """google.auth.credentials.Credentials: Credentials to use for queries
        performed through IPython magics.

        Note:
            These credentials do not need to be explicitly defined if you are
            using Application Default Credentials. If you are not using
            Application Default Credentials, manually construct a
            :class:`google.auth.credentials.Credentials` object and set it as
            the context credentials as demonstrated in the example below. See
            `auth docs`_ for more information on obtaining credentials.

        Example:
            Manually setting the context credentials:

            >>> from google.cloud.bigquery import magics
            >>> from google.oauth2 import service_account
            >>> credentials = (service_account
            ...     .Credentials.from_service_account_file(
            ...         '/path/to/key.json'))
            >>> magics.context.credentials = credentials

        .. _auth docs: http://google-auth.readthedocs.io
            /en/latest/user-guide.html#obtaining-credentials
        """
        if self._credentials is None:
            self._credentials, _ = google.auth.default()
        return self._credentials

    @credentials.setter
    def credentials(self, value):
        self._credentials = value

    @property
    def project(self):
        """str: Default project to use for queries performed through IPython
        magics.

        Note:
            The project does not need to be explicitly defined if you have an
            environment default project set. If you do not have a default
            project set in your environment, manually assign the project as
            demonstrated in the example below.

        Example:
            Manually setting the context project:

            >>> from google.cloud.bigquery import magics
            >>> magics.context.project = 'my-project'
        """
        if self._project is None:
            _, self._project = google.auth.default()
        return self._project

    @project.setter
    def project(self, value):
        self._project = value

    @property
    def bigquery_client_options(self):
        """google.api_core.client_options.ClientOptions: client options to be
        used through IPython magics.

        Note:
            The client options do not need to be explicitly defined if no
            special network connections are required. Normally you would be
            using the https://bigquery.googleapis.com/ endpoint.

        Example:
            Manually setting the endpoint:

            >>> from google.cloud.bigquery import magics
            >>> client_options = {}
            >>> client_options['api_endpoint'] = "https://some.special.url"
            >>> magics.context.bigquery_client_options = client_options
        """
        return self._bigquery_client_options

    @bigquery_client_options.setter
    def bigquery_client_options(self, value):
        self._bigquery_client_options = value

    @property
    def bqstorage_client_options(self):
        """google.api_core.client_options.ClientOptions: client options to be
        used through IPython magics for the storage client.

        Note:
            The client options do not need to be explicitly defined if no
            special network connections are required. Normally you would be
            using the https://bigquerystorage.googleapis.com/ endpoint.

        Example:
            Manually setting the endpoint:

            >>> from google.cloud.bigquery import magics
            >>> client_options = {}
            >>> client_options['api_endpoint'] = "https://some.special.url"
            >>> magics.context.bqstorage_client_options = client_options
        """
        return self._bqstorage_client_options

    @bqstorage_client_options.setter
    def bqstorage_client_options(self, value):
        self._bqstorage_client_options = value

    @property
    def default_query_job_config(self):
        """google.cloud.bigquery.job.QueryJobConfig: Default job
        configuration for queries.

        The context's :class:`~google.cloud.bigquery.job.QueryJobConfig` is
        used for queries. Some properties can be overridden with arguments to
        the magics.

        Example:
            Manually setting the default value for ``maximum_bytes_billed``
            to 100 MB:

            >>> from google.cloud.bigquery import magics
            >>> magics.context.default_query_job_config.maximum_bytes_billed = 100000000
        """
        return self._default_query_job_config

    @default_query_job_config.setter
    def default_query_job_config(self, value):
        self._default_query_job_config = value

    @property
    def progress_bar_type(self):
        """str: Default progress bar type to use to display progress bar while
        executing queries through IPython magics.

        Note:
            Install the ``tqdm`` package to use this feature.

        Example:
            Manually setting the progress_bar_type:

            >>> from google.cloud.bigquery import magics
            >>> magics.context.progress_bar_type = "tqdm_notebook"
        """
        return self._progress_bar_type

    @progress_bar_type.setter
    def progress_bar_type(self, value):
        self._progress_bar_type = value


# If bigquery_magics is available, we load that extension rather than this one.
# Ensure google.cloud.bigquery.magics.context setters are on the correct magics
# implementation in case the user has installed the package but hasn't updated
# their code.
if bigquery_magics is not None:
    context = bigquery_magics.context
else:
    context = Context()
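
# Illustrative (a sketch, not executed here): notebook code can configure the
# shared context before running any queries, regardless of which
# implementation is active. The project ID below is a placeholder.
#
#     from google.cloud.bigquery import magics
#     magics.context.project = "my-project"
#     magics.context.progress_bar_type = "tqdm_notebook"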


def _handle_error(error, destination_var=None):
    """Process a query execution error.

    Args:
        error (Exception):
            An exception that occurred during the query execution.
        destination_var (Optional[str]):
            The name of the IPython session variable to store the query job.
    """
    if destination_var:
        query_job = getattr(error, "query_job", None)

        if query_job is not None:
            IPython.get_ipython().push({destination_var: query_job})
        else:
            # this is the case when previewing table rows by providing just
            # table ID to cell magic
            print(
                "Could not save output to variable '{}'.".format(destination_var),
                file=sys.stderr,
            )

    print("\nERROR:\n", str(error), file=sys.stderr)


def _run_query(client, query, job_config=None):
    """Runs a query while printing status updates

    Args:
        client (google.cloud.bigquery.client.Client):
            Client to bundle configuration needed for API requests.
        query (str):
            SQL query to be executed. Defaults to the standard SQL dialect.
            Use the ``job_config`` parameter to change dialects.
        job_config (Optional[google.cloud.bigquery.job.QueryJobConfig]):
            Extra configuration options for the job.

    Returns:
        google.cloud.bigquery.job.QueryJob: the query job created

    Example:
        >>> client = bigquery.Client()
        >>> _run_query(client, "SELECT 17")
        Executing query with job ID: bf633912-af2c-4780-b568-5d868058632b
        Query executing: 1.66s
        Job ID bf633912-af2c-4780-b568-5d868058632b successfully executed
    """
    start_time = time.perf_counter()
    query_job = client.query(query, job_config=job_config)

    if job_config and job_config.dry_run:
        return query_job

    print(f"Executing query with job ID: {query_job.job_id}")

    while True:
        print(
            f"\rQuery executing: {time.perf_counter() - start_time:.2f}s",
            end="",
        )
        try:
            query_job.result(timeout=0.5)
            break
        except futures.TimeoutError:
            continue
    print(f"\nJob ID {query_job.job_id} successfully executed")
    return query_job


def _create_dataset_if_necessary(client, dataset_id):
    """Create a dataset in the current project if it doesn't exist.

    Args:
        client (google.cloud.bigquery.client.Client):
            Client to bundle configuration needed for API requests.
        dataset_id (str):
            Dataset id.
    """
    dataset_reference = bigquery.dataset.DatasetReference(client.project, dataset_id)
    try:
        # Reuse the dataset if it already exists.
        client.get_dataset(dataset_reference)
        return
    except NotFound:
        pass
    dataset = bigquery.Dataset(dataset_reference)
    dataset.location = client.location
    print(f"Creating dataset: {dataset_id}")
    client.create_dataset(dataset)


@magic_arguments.magic_arguments()
@magic_arguments.argument(
    "destination_var",
    nargs="?",
    help=("If provided, save the output to this variable instead of displaying it."),
)
@magic_arguments.argument(
    "--destination_table",
    type=str,
    default=None,
    help=(
        "If provided, save the output of the query to a new BigQuery table. "
        "Variable should be in a format <dataset_id>.<table_id>. "
        "If the table does not exist, it will be created. "
        "If the table already exists, its data will be overwritten."
    ),
)
@magic_arguments.argument(
    "--project",
    type=str,
    default=None,
    help=("Project to use for executing this query. Defaults to the context project."),
)
@magic_arguments.argument(
    "--max_results",
    default=None,
    help=(
        "Maximum number of rows in dataframe returned from executing the query. "
        "Defaults to returning all rows."
    ),
)
@magic_arguments.argument(
    "--maximum_bytes_billed",
    default=None,
    help=(
        "maximum_bytes_billed to use for executing this query. Defaults to "
        "the context default_query_job_config.maximum_bytes_billed."
    ),
)
@magic_arguments.argument(
    "--dry_run",
    action="store_true",
    default=False,
    help=(
        "Sets query to be a dry run to estimate costs. "
        "Defaults to executing the query instead of dry run if this argument is not used."
    ),
)
@magic_arguments.argument(
    "--use_legacy_sql",
    action="store_true",
    default=False,
    help=(
        "Sets query to use Legacy SQL instead of Standard SQL. Defaults to "
        "Standard SQL if this argument is not used."
    ),
)
@magic_arguments.argument(
    "--bigquery_api_endpoint",
    type=str,
    default=None,
    help=(
        "The desired API endpoint, e.g., bigquery.googleapis.com. Defaults to this "
        "option's value in the context bigquery_client_options."
    ),
)
@magic_arguments.argument(
    "--bqstorage_api_endpoint",
    type=str,
    default=None,
    help=(
        "The desired API endpoint, e.g., bigquerystorage.googleapis.com. Defaults to "
        "this option's value in the context bqstorage_client_options."
    ),
)
@magic_arguments.argument(
    "--no_query_cache",
    action="store_true",
    default=False,
    help=("Do not use cached query results."),
)
@magic_arguments.argument(
    "--use_bqstorage_api",
    action="store_true",
    default=None,
    help=(
        "[Deprecated] The BigQuery Storage API is already used by default to "
        "download large query results, and this option has no effect. "
        "If you want to switch to the classic REST API instead, use the "
        "--use_rest_api option."
    ),
)
@magic_arguments.argument(
    "--use_rest_api",
    action="store_true",
    default=False,
    help=(
        "Use the classic REST API instead of the BigQuery Storage API to "
        "download query results."
    ),
)
@magic_arguments.argument(
    "--verbose",
    action="store_true",
    default=False,
    help=(
        "If set, print verbose output, including the query job ID and the "
        "amount of time for the query to finish. By default, this "
        "information will be displayed as the query runs, but will be "
        "cleared after the query is finished."
    ),
)
@magic_arguments.argument(
    "--params",
    nargs="+",
    default=None,
    help=(
        "Parameters to format the query string. If present, the --params "
        "flag should be followed by a string representation of a dictionary "
        "in the format {'param_name': 'param_value'} (ex. {\"num\": 17}), "
        "or a reference to a dictionary in the same format. The dictionary "
        "reference can be made by including a '$' before the variable "
        "name (ex. $my_dict_var)."
    ),
)
@magic_arguments.argument(
    "--progress_bar_type",
    type=str,
    default=None,
    help=(
        "Sets progress bar type to display a progress bar while executing the query. "
        "Defaults to use tqdm_notebook. Install the ``tqdm`` package to use this feature."
    ),
)
@magic_arguments.argument(
    "--location",
    type=str,
    default=None,
    help=(
        "Set the location to execute query. "
        "Defaults to location set in query setting in console."
    ),
)
def _cell_magic(line, query):
    """Underlying function for bigquery cell magic

    Note:
        This function contains the underlying logic for the 'bigquery' cell
        magic. This function is not meant to be called directly.

    Args:
        line (str): "%%bigquery" followed by arguments as required
        query (str): SQL query to run

    Returns:
        pandas.DataFrame: the query results.
    """
    # The built-in parser does not recognize Python structures such as dicts, thus
    # we extract the "--params" option and interpret it separately.
    try:
        params_option_value, rest_of_args = _split_args_line(line)
    except lap.exceptions.QueryParamsParseError as exc:
        rebranded_error = SyntaxError(
            "--params is not a correctly formatted JSON string or a JSON "
            "serializable dictionary"
        )
        raise rebranded_error from exc
    except lap.exceptions.DuplicateQueryParamsError as exc:
        rebranded_error = ValueError("Duplicate --params option.")
        raise rebranded_error from exc
    except lap.exceptions.ParseError as exc:
        rebranded_error = ValueError(
            "Unrecognized input, are option values correct? "
            "Error details: {}".format(exc.args[0])
        )
        raise rebranded_error from exc

    args = magic_arguments.parse_argstring(_cell_magic, rest_of_args)

    if args.use_bqstorage_api is not None:
        warnings.warn(
            "Deprecated option --use_bqstorage_api, the BigQuery "
            "Storage API is already used by default.",
            category=DeprecationWarning,
        )
    use_bqstorage_api = not args.use_rest_api
    location = args.location

    params = []
    if params_option_value:
        # A non-existing params variable is not expanded and ends up in the input
        # in its raw form, e.g. "$query_params".
        if params_option_value.startswith("$"):
            msg = 'Parameter expansion failed, undefined variable "{}".'.format(
                params_option_value[1:]
            )
            raise NameError(msg)

        params = _helpers.to_query_parameters(ast.literal_eval(params_option_value), {})

    project = args.project or context.project

    bigquery_client_options = copy.deepcopy(context.bigquery_client_options)
    if args.bigquery_api_endpoint:
        if isinstance(bigquery_client_options, dict):
            bigquery_client_options["api_endpoint"] = args.bigquery_api_endpoint
        else:
            bigquery_client_options.api_endpoint = args.bigquery_api_endpoint

    client = bigquery.Client(
        project=project,
        credentials=context.credentials,
        default_query_job_config=context.default_query_job_config,
        client_info=client_info.ClientInfo(user_agent=IPYTHON_USER_AGENT),
        client_options=bigquery_client_options,
        location=location,
    )
    if context._connection:
        client._connection = context._connection

    bqstorage_client_options = copy.deepcopy(context.bqstorage_client_options)
    if args.bqstorage_api_endpoint:
        if isinstance(bqstorage_client_options, dict):
            bqstorage_client_options["api_endpoint"] = args.bqstorage_api_endpoint
        else:
            bqstorage_client_options.api_endpoint = args.bqstorage_api_endpoint

    bqstorage_client = _make_bqstorage_client(
        client,
        use_bqstorage_api,
        bqstorage_client_options,
    )

    close_transports = functools.partial(_close_transports, client, bqstorage_client)

    try:
        if args.max_results:
            max_results = int(args.max_results)
        else:
            max_results = None

        query = query.strip()

        if not query:
            error = ValueError("Query is missing.")
            _handle_error(error, args.destination_var)
            return

        # Check if query is given as a reference to a variable.
        if query.startswith("$"):
            query_var_name = query[1:]

            if not query_var_name:
                missing_msg = 'Missing query variable name, empty "$" is not allowed.'
                raise NameError(missing_msg)

            if query_var_name.isidentifier():
                ip = IPython.get_ipython()
                query = ip.user_ns.get(query_var_name, ip)  # ip serves as a sentinel

                if query is ip:
                    raise NameError(
                        f"Unknown query, variable {query_var_name} does not exist."
                    )
                else:
                    if not isinstance(query, (str, bytes)):
                        raise TypeError(
                            f"Query variable {query_var_name} must be a string "
                            "or a bytes-like value."
                        )

        # Any query that does not contain whitespace (aside from leading and trailing whitespace)
        # is assumed to be a table id
        if not re.search(r"\s", query):
            try:
                rows = client.list_rows(query, max_results=max_results)
            except Exception as ex:
                _handle_error(ex, args.destination_var)
                return

            result = rows.to_dataframe(
                bqstorage_client=bqstorage_client,
                create_bqstorage_client=False,
            )
            if args.destination_var:
                IPython.get_ipython().push({args.destination_var: result})
                return
            else:
                return result

        job_config = bigquery.job.QueryJobConfig()
        job_config.query_parameters = params
        job_config.use_legacy_sql = args.use_legacy_sql
        job_config.dry_run = args.dry_run

        # Don't override context job config unless --no_query_cache is explicitly set.
        if args.no_query_cache:
            job_config.use_query_cache = False

        if args.destination_table:
            split = args.destination_table.split(".")
            if len(split) != 2:
                raise ValueError(
                    "--destination_table should be in a <dataset_id>.<table_id> format."
                )
            dataset_id, table_id = split
            job_config.allow_large_results = True
            dataset_ref = bigquery.dataset.DatasetReference(client.project, dataset_id)
            destination_table_ref = dataset_ref.table(table_id)
            job_config.destination = destination_table_ref
            job_config.create_disposition = "CREATE_IF_NEEDED"
            job_config.write_disposition = "WRITE_TRUNCATE"
            _create_dataset_if_necessary(client, dataset_id)

        if args.maximum_bytes_billed == "None":
            job_config.maximum_bytes_billed = 0
        elif args.maximum_bytes_billed is not None:
            value = int(args.maximum_bytes_billed)
            job_config.maximum_bytes_billed = value

        try:
            query_job = _run_query(client, query, job_config=job_config)
        except Exception as ex:
            _handle_error(ex, args.destination_var)
            return

        if not args.verbose:
            display.clear_output()

        if args.dry_run and args.destination_var:
            IPython.get_ipython().push({args.destination_var: query_job})
            return
        elif args.dry_run:
            print(
                "Query validated. This query will process {} bytes.".format(
                    query_job.total_bytes_processed
                )
            )
            return query_job

        progress_bar = context.progress_bar_type or args.progress_bar_type

        if max_results:
            result = query_job.result(max_results=max_results).to_dataframe(
                bqstorage_client=None,
                create_bqstorage_client=False,
                progress_bar_type=progress_bar,
            )
        else:
            result = query_job.to_dataframe(
                bqstorage_client=bqstorage_client,
                create_bqstorage_client=False,
                progress_bar_type=progress_bar,
            )

        if args.destination_var:
            IPython.get_ipython().push({args.destination_var: result})
        else:
            return result
    finally:
        close_transports()
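
# Illustrative invocations of the cell magic wired to the options above
# (a sketch in IPython cell syntax, shown as comments; the table, variable,
# and project names are placeholders):
#
#     %%bigquery df --project my-project --maximum_bytes_billed 100000000
#     SELECT 17 AS answer
#
#     %%bigquery --params {"limit": 10}
#     SELECT name FROM `my-project.my_dataset.my_table` LIMIT @limit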


def _split_args_line(line):
    """Split out the --params option value from the input line arguments.

    Args:
        line (str): The line arguments passed to the cell magic.

    Returns:
        Tuple[str, str]
    """
    lexer = lap.Lexer(line)
    scanner = lap.Parser(lexer)
    tree = scanner.input_line()

    extractor = lap.QueryParamsExtractor()
    params_option_value, rest_of_args = extractor.visit(tree)

    return params_option_value, rest_of_args
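
# Illustrative behavior (an assumption based on the parser's contract, not a
# doctest; the exact whitespace in the returned remainder may differ):
#
#     _split_args_line('--params {"num": 17} --max_results 10')
#     # -> ('{"num": 17}', ' --max_results 10')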


def _make_bqstorage_client(client, use_bqstorage_api, client_options):
    """Creates a BigQuery Storage client.

    Args:
        client (:class:`~google.cloud.bigquery.client.Client`): BigQuery client.
        use_bqstorage_api (bool): whether the BigQuery Storage API is used or not.
        client_options (:class:`google.api_core.client_options.ClientOptions`):
            Custom options used with a new BigQuery Storage client instance
            if one is created.

    Raises:
        ImportError: if google-cloud-bigquery-storage is not installed, or
            the grpcio package is not installed.

    Returns:
        None if ``use_bqstorage_api == False`` or google-cloud-bigquery-storage
        is outdated; otherwise, a BigQuery Storage client.
    """
    if not use_bqstorage_api:
        return None

    try:
        _versions_helpers.BQ_STORAGE_VERSIONS.try_import(raise_if_error=True)
    except exceptions.BigQueryStorageNotFoundError as err:
        customized_error = ImportError(
            "The default BigQuery Storage API client cannot be used, install "
            "the missing google-cloud-bigquery-storage and pyarrow packages "
            "to use it. Alternatively, use the classic REST API by specifying "
            "the --use_rest_api magic option."
        )
        raise customized_error from err
    except exceptions.LegacyBigQueryStorageError:
        pass

    try:
        from google.api_core.gapic_v1 import client_info as gapic_client_info
    except ImportError as err:
        customized_error = ImportError(
            "Install the grpcio package to use the BigQuery Storage API."
        )
        raise customized_error from err

    return client._ensure_bqstorage_client(
        client_options=client_options,
        client_info=gapic_client_info.ClientInfo(user_agent=IPYTHON_USER_AGENT),
    )


def _close_transports(client, bqstorage_client):
    """Close the given clients' underlying transport channels.

    Closing the transport is needed to release system resources, namely open
    sockets.

    Args:
        client (:class:`~google.cloud.bigquery.client.Client`):
            A client for the BigQuery API.
        bqstorage_client
            (Optional[:class:`~google.cloud.bigquery_storage.BigQueryReadClient`]):
            A client for the BigQuery Storage API.
    """
    client.close()
    if bqstorage_client is not None:
        bqstorage_client._transport.grpc_channel.close()