# Copyright 2015 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Classes for query jobs.""" import concurrent.futures import copy import re import time import typing from typing import Any, Dict, Iterable, List, Optional, Union from google.api_core import exceptions from google.api_core import retry as retries import requests from google.cloud.bigquery.dataset import Dataset from google.cloud.bigquery.dataset import DatasetListItem from google.cloud.bigquery.dataset import DatasetReference from google.cloud.bigquery.encryption_configuration import EncryptionConfiguration from google.cloud.bigquery.enums import KeyResultStatementKind, DefaultPandasDTypes from google.cloud.bigquery.external_config import ExternalConfig from google.cloud.bigquery import _helpers from google.cloud.bigquery.query import ( _query_param_from_api_repr, ArrayQueryParameter, ConnectionProperty, ScalarQueryParameter, StructQueryParameter, UDFResource, ) from google.cloud.bigquery.retry import ( DEFAULT_RETRY, DEFAULT_JOB_RETRY, POLLING_DEFAULT_VALUE, ) from google.cloud.bigquery.routine import RoutineReference from google.cloud.bigquery.schema import SchemaField from google.cloud.bigquery.table import _EmptyRowIterator from google.cloud.bigquery.table import RangePartitioning from google.cloud.bigquery.table import _table_arg_to_table_ref from google.cloud.bigquery.table import TableReference from google.cloud.bigquery.table import TimePartitioning from google.cloud.bigquery._tqdm_helpers import wait_for_query from google.cloud.bigquery.job.base import _AsyncJob from google.cloud.bigquery.job.base import _JobConfig from google.cloud.bigquery.job.base import _JobReference try: import pandas # type: ignore except ImportError: pandas = None if typing.TYPE_CHECKING: # pragma: NO COVER # Assumption: type checks are only used by library developers and CI environments # that have all optional dependencies installed, thus no conditional imports. import pandas # type: ignore import geopandas # type: ignore import pyarrow # type: ignore from google.cloud import bigquery_storage from google.cloud.bigquery.client import Client from google.cloud.bigquery.table import RowIterator _CONTAINS_ORDER_BY = re.compile(r"ORDER\s+BY", re.IGNORECASE) _EXCEPTION_FOOTER_TEMPLATE = "{message}\n\nLocation: {location}\nJob ID: {job_id}\n" _TIMEOUT_BUFFER_SECS = 0.1 def _contains_order_by(query): """Do we need to preserve the order of the query results? This function has known false positives, such as with ordered window functions: .. code-block:: sql SELECT SUM(x) OVER ( window_name PARTITION BY... ORDER BY... window_frame_clause) FROM ... This false positive failure case means the behavior will be correct, but downloading results with the BigQuery Storage API may be slower than it otherwise would. This is preferable to the false negative case, where results are expected to be in order but are not (due to parallel reads). 
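    A rough sketch of how this heuristic behaves (these calls only exercise
    the regular expression above; the queries are hypothetical):

    .. code-block:: python

        _contains_order_by("SELECT x FROM t ORDER BY x")  # truthy: preserve order
        _contains_order_by(
            "SELECT SUM(x) OVER (ORDER BY x) FROM t"
        )  # truthy: the known false positive described above
        _contains_order_by("SELECT x FROM t")  # falsy: parallel reads are fine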
""" return query and _CONTAINS_ORDER_BY.search(query) def _from_api_repr_query_parameters(resource): return [_query_param_from_api_repr(mapping) for mapping in resource] def _to_api_repr_query_parameters(value): return [query_parameter.to_api_repr() for query_parameter in value] def _from_api_repr_udf_resources(resource): udf_resources = [] for udf_mapping in resource: for udf_type, udf_value in udf_mapping.items(): udf_resources.append(UDFResource(udf_type, udf_value)) return udf_resources def _to_api_repr_udf_resources(value): return [{udf_resource.udf_type: udf_resource.value} for udf_resource in value] def _from_api_repr_table_defs(resource): return {k: ExternalConfig.from_api_repr(v) for k, v in resource.items()} def _to_api_repr_table_defs(value): return {k: ExternalConfig.to_api_repr(v) for k, v in value.items()} class BiEngineReason(typing.NamedTuple): """Reason for BI Engine acceleration failure https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#bienginereason """ code: str = "CODE_UNSPECIFIED" reason: str = "" @classmethod def from_api_repr(cls, reason: Dict[str, str]) -> "BiEngineReason": return cls(reason.get("code", "CODE_UNSPECIFIED"), reason.get("message", "")) class BiEngineStats(typing.NamedTuple): """Statistics for a BI Engine query https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#bienginestatistics """ mode: str = "ACCELERATION_MODE_UNSPECIFIED" """ Specifies which mode of BI Engine acceleration was performed (if any) """ reasons: List[BiEngineReason] = [] """ Contains explanatory messages in case of DISABLED / PARTIAL acceleration """ @classmethod def from_api_repr(cls, stats: Dict[str, Any]) -> "BiEngineStats": mode = stats.get("biEngineMode", "ACCELERATION_MODE_UNSPECIFIED") reasons = [ BiEngineReason.from_api_repr(r) for r in stats.get("biEngineReasons", []) ] return cls(mode, reasons) class DmlStats(typing.NamedTuple): """Detailed statistics for DML statements. https://cloud.google.com/bigquery/docs/reference/rest/v2/DmlStats """ inserted_row_count: int = 0 """Number of inserted rows. Populated by DML INSERT and MERGE statements.""" deleted_row_count: int = 0 """Number of deleted rows. populated by DML DELETE, MERGE and TRUNCATE statements. """ updated_row_count: int = 0 """Number of updated rows. Populated by DML UPDATE and MERGE statements.""" @classmethod def from_api_repr(cls, stats: Dict[str, str]) -> "DmlStats": # NOTE: The field order here must match the order of fields set at the # class level. api_fields = ("insertedRowCount", "deletedRowCount", "updatedRowCount") args = ( int(stats.get(api_field, default_val)) for api_field, default_val in zip(api_fields, cls.__new__.__defaults__) # type: ignore ) return cls(*args) class IndexUnusedReason(typing.NamedTuple): """Reason about why no search index was used in the search query (or sub-query). https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#indexunusedreason """ code: Optional[str] = None """Specifies the high-level reason for the scenario when no search index was used. """ message: Optional[str] = None """Free form human-readable reason for the scenario when no search index was used. """ baseTable: Optional[TableReference] = None """Specifies the base table involved in the reason that no search index was used. 
""" indexName: Optional[str] = None """Specifies the name of the unused search index, if available.""" @classmethod def from_api_repr(cls, reason): code = reason.get("code") message = reason.get("message") baseTable = reason.get("baseTable") indexName = reason.get("indexName") return cls(code, message, baseTable, indexName) class SearchStats(typing.NamedTuple): """Statistics related to Search Queries. Populated as part of JobStatistics2. https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#searchstatistics """ mode: Optional[str] = None """Indicates the type of search index usage in the entire search query.""" reason: List[IndexUnusedReason] = [] """Reason about why no search index was used in the search query (or sub-query)""" @classmethod def from_api_repr(cls, stats: Dict[str, Any]): mode = stats.get("indexUsageMode", None) reason = [ IndexUnusedReason.from_api_repr(r) for r in stats.get("indexUnusedReasons", []) ] return cls(mode, reason) class ScriptOptions: """Options controlling the execution of scripts. https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#ScriptOptions """ def __init__( self, statement_timeout_ms: Optional[int] = None, statement_byte_budget: Optional[int] = None, key_result_statement: Optional[KeyResultStatementKind] = None, ): self._properties: Dict[str, Any] = {} self.statement_timeout_ms = statement_timeout_ms self.statement_byte_budget = statement_byte_budget self.key_result_statement = key_result_statement @classmethod def from_api_repr(cls, resource: Dict[str, Any]) -> "ScriptOptions": """Factory: construct instance from the JSON repr. Args: resource(Dict[str: Any]): ScriptOptions representation returned from API. Returns: google.cloud.bigquery.ScriptOptions: ScriptOptions sample parsed from ``resource``. """ entry = cls() entry._properties = copy.deepcopy(resource) return entry def to_api_repr(self) -> Dict[str, Any]: """Construct the API resource representation.""" return copy.deepcopy(self._properties) @property def statement_timeout_ms(self) -> Union[int, None]: """Timeout period for each statement in a script.""" return _helpers._int_or_none(self._properties.get("statementTimeoutMs")) @statement_timeout_ms.setter def statement_timeout_ms(self, value: Union[int, None]): new_value = None if value is None else str(value) self._properties["statementTimeoutMs"] = new_value @property def statement_byte_budget(self) -> Union[int, None]: """Limit on the number of bytes billed per statement. Exceeding this budget results in an error. """ return _helpers._int_or_none(self._properties.get("statementByteBudget")) @statement_byte_budget.setter def statement_byte_budget(self, value: Union[int, None]): new_value = None if value is None else str(value) self._properties["statementByteBudget"] = new_value @property def key_result_statement(self) -> Union[KeyResultStatementKind, None]: """Determines which statement in the script represents the "key result". This is used to populate the schema and query results of the script job. Default is ``KeyResultStatementKind.LAST``. """ return self._properties.get("keyResultStatement") @key_result_statement.setter def key_result_statement(self, value: Union[KeyResultStatementKind, None]): self._properties["keyResultStatement"] = value class QueryJobConfig(_JobConfig): """Configuration options for query jobs. All properties in this class are optional. Values which are :data:`None` -> server defaults. Set properties on the constructed configuration by using the property name as the name of a keyword argument. 
""" def __init__(self, **kwargs) -> None: super(QueryJobConfig, self).__init__("query", **kwargs) @property def destination_encryption_configuration(self): """google.cloud.bigquery.encryption_configuration.EncryptionConfiguration: Custom encryption configuration for the destination table. Custom encryption configuration (e.g., Cloud KMS keys) or :data:`None` if using default encryption. See https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationQuery.FIELDS.destination_encryption_configuration """ prop = self._get_sub_prop("destinationEncryptionConfiguration") if prop is not None: prop = EncryptionConfiguration.from_api_repr(prop) return prop @destination_encryption_configuration.setter def destination_encryption_configuration(self, value): api_repr = value if value is not None: api_repr = value.to_api_repr() self._set_sub_prop("destinationEncryptionConfiguration", api_repr) @property def allow_large_results(self): """bool: Allow large query results tables (legacy SQL, only) See https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationQuery.FIELDS.allow_large_results """ return self._get_sub_prop("allowLargeResults") @allow_large_results.setter def allow_large_results(self, value): self._set_sub_prop("allowLargeResults", value) @property def connection_properties(self) -> List[ConnectionProperty]: """Connection properties. See https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationQuery.FIELDS.connection_properties .. versionadded:: 2.29.0 """ resource = self._get_sub_prop("connectionProperties", []) return [ConnectionProperty.from_api_repr(prop) for prop in resource] @connection_properties.setter def connection_properties(self, value: Iterable[ConnectionProperty]): self._set_sub_prop( "connectionProperties", [prop.to_api_repr() for prop in value], ) @property def create_disposition(self): """google.cloud.bigquery.job.CreateDisposition: Specifies behavior for creating tables. See https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationQuery.FIELDS.create_disposition """ return self._get_sub_prop("createDisposition") @create_disposition.setter def create_disposition(self, value): self._set_sub_prop("createDisposition", value) @property def create_session(self) -> Optional[bool]: """[Preview] If :data:`True`, creates a new session, where :attr:`~google.cloud.bigquery.job.QueryJob.session_info` will contain a random server generated session id. If :data:`False`, runs query with an existing ``session_id`` passed in :attr:`~google.cloud.bigquery.job.QueryJobConfig.connection_properties`, otherwise runs query in non-session mode. See https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationQuery.FIELDS.create_session .. versionadded:: 2.29.0 """ return self._get_sub_prop("createSession") @create_session.setter def create_session(self, value: Optional[bool]): self._set_sub_prop("createSession", value) @property def default_dataset(self): """google.cloud.bigquery.dataset.DatasetReference: the default dataset to use for unqualified table names in the query or :data:`None` if not set. The ``default_dataset`` setter accepts: - a :class:`~google.cloud.bigquery.dataset.Dataset`, or - a :class:`~google.cloud.bigquery.dataset.DatasetReference`, or - a :class:`str` of the fully-qualified dataset ID in standard SQL format. The value must included a project ID and dataset ID separated by ``.``. For example: ``your-project.your_dataset``. 
See https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationQuery.FIELDS.default_dataset """ prop = self._get_sub_prop("defaultDataset") if prop is not None: prop = DatasetReference.from_api_repr(prop) return prop @default_dataset.setter def default_dataset(self, value): if value is None: self._set_sub_prop("defaultDataset", None) return if isinstance(value, str): value = DatasetReference.from_string(value) if isinstance(value, (Dataset, DatasetListItem)): value = value.reference resource = value.to_api_repr() self._set_sub_prop("defaultDataset", resource) @property def destination(self): """google.cloud.bigquery.table.TableReference: table where results are written or :data:`None` if not set. The ``destination`` setter accepts: - a :class:`~google.cloud.bigquery.table.Table`, or - a :class:`~google.cloud.bigquery.table.TableReference`, or - a :class:`str` of the fully-qualified table ID in standard SQL format. The value must included a project ID, dataset ID, and table ID, each separated by ``.``. For example: ``your-project.your_dataset.your_table``. .. note:: Only table ID is passed to the backend, so any configuration in `~google.cloud.bigquery.table.Table` is discarded. See https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationQuery.FIELDS.destination_table """ prop = self._get_sub_prop("destinationTable") if prop is not None: prop = TableReference.from_api_repr(prop) return prop @destination.setter def destination(self, value): if value is None: self._set_sub_prop("destinationTable", None) return value = _table_arg_to_table_ref(value) resource = value.to_api_repr() self._set_sub_prop("destinationTable", resource) @property def dry_run(self): """bool: :data:`True` if this query should be a dry run to estimate costs. See https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfiguration.FIELDS.dry_run """ return self._properties.get("dryRun") @dry_run.setter def dry_run(self, value): self._properties["dryRun"] = value @property def flatten_results(self): """bool: Flatten nested/repeated fields in results. (Legacy SQL only) See https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationQuery.FIELDS.flatten_results """ return self._get_sub_prop("flattenResults") @flatten_results.setter def flatten_results(self, value): self._set_sub_prop("flattenResults", value) @property def maximum_billing_tier(self): """int: Deprecated. Changes the billing tier to allow high-compute queries. See https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationQuery.FIELDS.maximum_billing_tier """ return self._get_sub_prop("maximumBillingTier") @maximum_billing_tier.setter def maximum_billing_tier(self, value): self._set_sub_prop("maximumBillingTier", value) @property def maximum_bytes_billed(self): """int: Maximum bytes to be billed for this job or :data:`None` if not set. See https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationQuery.FIELDS.maximum_bytes_billed """ return _helpers._int_or_none(self._get_sub_prop("maximumBytesBilled")) @maximum_bytes_billed.setter def maximum_bytes_billed(self, value): self._set_sub_prop("maximumBytesBilled", str(value)) @property def priority(self): """google.cloud.bigquery.job.QueryPriority: Priority of the query. 
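        For instance, to run the query at batch priority (a sketch;
        ``QueryPriority.BATCH`` is the batch-priority constant):

        .. code-block:: python

            from google.cloud import bigquery

            config = bigquery.QueryJobConfig(priority=bigquery.QueryPriority.BATCH)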
See https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationQuery.FIELDS.priority """ return self._get_sub_prop("priority") @priority.setter def priority(self, value): self._set_sub_prop("priority", value) @property def query_parameters(self): """List[Union[google.cloud.bigquery.query.ArrayQueryParameter, \ google.cloud.bigquery.query.ScalarQueryParameter, \ google.cloud.bigquery.query.StructQueryParameter]]: list of parameters for parameterized query (empty by default) See: https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationQuery.FIELDS.query_parameters """ prop = self._get_sub_prop("queryParameters", default=[]) return _from_api_repr_query_parameters(prop) @query_parameters.setter def query_parameters(self, values): self._set_sub_prop("queryParameters", _to_api_repr_query_parameters(values)) @property def range_partitioning(self): """Optional[google.cloud.bigquery.table.RangePartitioning]: Configures range-based partitioning for destination table. .. note:: **Beta**. The integer range partitioning feature is in a pre-release state and might change or have limited support. Only specify at most one of :attr:`~google.cloud.bigquery.job.LoadJobConfig.time_partitioning` or :attr:`~google.cloud.bigquery.job.LoadJobConfig.range_partitioning`. Raises: ValueError: If the value is not :class:`~google.cloud.bigquery.table.RangePartitioning` or :data:`None`. """ resource = self._get_sub_prop("rangePartitioning") if resource is not None: return RangePartitioning(_properties=resource) @range_partitioning.setter def range_partitioning(self, value): resource = value if isinstance(value, RangePartitioning): resource = value._properties elif value is not None: raise ValueError( "Expected value to be RangePartitioning or None, got {}.".format(value) ) self._set_sub_prop("rangePartitioning", resource) @property def udf_resources(self): """List[google.cloud.bigquery.query.UDFResource]: user defined function resources (empty by default) See: https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationQuery.FIELDS.user_defined_function_resources """ prop = self._get_sub_prop("userDefinedFunctionResources", default=[]) return _from_api_repr_udf_resources(prop) @udf_resources.setter def udf_resources(self, values): self._set_sub_prop( "userDefinedFunctionResources", _to_api_repr_udf_resources(values) ) @property def use_legacy_sql(self): """bool: Use legacy SQL syntax. See https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationQuery.FIELDS.use_legacy_sql """ return self._get_sub_prop("useLegacySql") @use_legacy_sql.setter def use_legacy_sql(self, value): self._set_sub_prop("useLegacySql", value) @property def use_query_cache(self): """bool: Look for the query result in the cache. See https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationQuery.FIELDS.use_query_cache """ return self._get_sub_prop("useQueryCache") @use_query_cache.setter def use_query_cache(self, value): self._set_sub_prop("useQueryCache", value) @property def write_disposition(self): """google.cloud.bigquery.job.WriteDisposition: Action that occurs if the destination table already exists. 
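        For example, to overwrite an existing destination table (a sketch; the
        table ID is a placeholder):

        .. code-block:: python

            from google.cloud import bigquery

            config = bigquery.QueryJobConfig(
                destination="your-project.your_dataset.your_table",
                write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
            )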
See https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationQuery.FIELDS.write_disposition """ return self._get_sub_prop("writeDisposition") @write_disposition.setter def write_disposition(self, value): self._set_sub_prop("writeDisposition", value) @property def table_definitions(self): """Dict[str, google.cloud.bigquery.external_config.ExternalConfig]: Definitions for external tables or :data:`None` if not set. See https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationQuery.FIELDS.external_table_definitions """ prop = self._get_sub_prop("tableDefinitions") if prop is not None: prop = _from_api_repr_table_defs(prop) return prop @table_definitions.setter def table_definitions(self, values): self._set_sub_prop("tableDefinitions", _to_api_repr_table_defs(values)) @property def time_partitioning(self): """Optional[google.cloud.bigquery.table.TimePartitioning]: Specifies time-based partitioning for the destination table. Only specify at most one of :attr:`~google.cloud.bigquery.job.LoadJobConfig.time_partitioning` or :attr:`~google.cloud.bigquery.job.LoadJobConfig.range_partitioning`. Raises: ValueError: If the value is not :class:`~google.cloud.bigquery.table.TimePartitioning` or :data:`None`. """ prop = self._get_sub_prop("timePartitioning") if prop is not None: prop = TimePartitioning.from_api_repr(prop) return prop @time_partitioning.setter def time_partitioning(self, value): api_repr = value if value is not None: api_repr = value.to_api_repr() self._set_sub_prop("timePartitioning", api_repr) @property def clustering_fields(self): """Optional[List[str]]: Fields defining clustering for the table (Defaults to :data:`None`). Clustering fields are immutable after table creation. .. note:: BigQuery supports clustering for both partitioned and non-partitioned tables. """ prop = self._get_sub_prop("clustering") if prop is not None: return list(prop.get("fields", ())) @clustering_fields.setter def clustering_fields(self, value): """Optional[List[str]]: Fields defining clustering for the table (Defaults to :data:`None`). """ if value is not None: self._set_sub_prop("clustering", {"fields": value}) else: self._del_sub_prop("clustering") @property def schema_update_options(self): """List[google.cloud.bigquery.job.SchemaUpdateOption]: Specifies updates to the destination table schema to allow as a side effect of the query job. """ return self._get_sub_prop("schemaUpdateOptions") @schema_update_options.setter def schema_update_options(self, values): self._set_sub_prop("schemaUpdateOptions", values) @property def script_options(self) -> ScriptOptions: """Options controlling the execution of scripts. https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#scriptoptions """ prop = self._get_sub_prop("scriptOptions") if prop is not None: prop = ScriptOptions.from_api_repr(prop) return prop @script_options.setter def script_options(self, value: Union[ScriptOptions, None]): new_value = None if value is None else value.to_api_repr() self._set_sub_prop("scriptOptions", new_value) def to_api_repr(self) -> dict: """Build an API representation of the query job config. Returns: Dict: A dictionary in the format used by the BigQuery API. """ resource = copy.deepcopy(self._properties) # Query parameters have an addition property associated with them # to indicate if the query is using named or positional parameters. 
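        # For example, a named ScalarQueryParameter serializes with a "name"
        # key while a positional parameter omits it; the check below keys off
        # that difference to choose between "NAMED" and "POSITIONAL" modes.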
query_parameters = resource.get("query", {}).get("queryParameters") if query_parameters: if query_parameters[0].get("name") is None: resource["query"]["parameterMode"] = "POSITIONAL" else: resource["query"]["parameterMode"] = "NAMED" return resource class QueryJob(_AsyncJob): """Asynchronous job: query tables. Args: job_id (str): the job's ID, within the project belonging to ``client``. query (str): SQL query string. client (google.cloud.bigquery.client.Client): A client which holds credentials and project configuration for the dataset (which requires a project). job_config (Optional[google.cloud.bigquery.job.QueryJobConfig]): Extra configuration options for the query job. """ _JOB_TYPE = "query" _UDF_KEY = "userDefinedFunctionResources" _CONFIG_CLASS = QueryJobConfig def __init__(self, job_id, query, client, job_config=None): super(QueryJob, self).__init__(job_id, client) if job_config is not None: self._properties["configuration"] = job_config._properties if self.configuration.use_legacy_sql is None: self.configuration.use_legacy_sql = False if query: _helpers._set_sub_prop( self._properties, ["configuration", "query", "query"], query ) self._query_results = None self._done_timeout = None self._transport_timeout = None @property def allow_large_results(self): """See :attr:`google.cloud.bigquery.job.QueryJobConfig.allow_large_results`. """ return self.configuration.allow_large_results @property def configuration(self) -> QueryJobConfig: """The configuration for this query job.""" return typing.cast(QueryJobConfig, super().configuration) @property def connection_properties(self) -> List[ConnectionProperty]: """See :attr:`google.cloud.bigquery.job.QueryJobConfig.connection_properties`. .. versionadded:: 2.29.0 """ return self.configuration.connection_properties @property def create_disposition(self): """See :attr:`google.cloud.bigquery.job.QueryJobConfig.create_disposition`. """ return self.configuration.create_disposition @property def create_session(self) -> Optional[bool]: """See :attr:`google.cloud.bigquery.job.QueryJobConfig.create_session`. .. versionadded:: 2.29.0 """ return self.configuration.create_session @property def default_dataset(self): """See :attr:`google.cloud.bigquery.job.QueryJobConfig.default_dataset`. """ return self.configuration.default_dataset @property def destination(self): """See :attr:`google.cloud.bigquery.job.QueryJobConfig.destination`. """ return self.configuration.destination @property def destination_encryption_configuration(self): """google.cloud.bigquery.encryption_configuration.EncryptionConfiguration: Custom encryption configuration for the destination table. Custom encryption configuration (e.g., Cloud KMS keys) or :data:`None` if using default encryption. See :attr:`google.cloud.bigquery.job.QueryJobConfig.destination_encryption_configuration`. """ return self.configuration.destination_encryption_configuration @property def dry_run(self): """See :attr:`google.cloud.bigquery.job.QueryJobConfig.dry_run`. """ return self.configuration.dry_run @property def flatten_results(self): """See :attr:`google.cloud.bigquery.job.QueryJobConfig.flatten_results`. """ return self.configuration.flatten_results @property def priority(self): """See :attr:`google.cloud.bigquery.job.QueryJobConfig.priority`. 
""" return self.configuration.priority @property def search_stats(self) -> Optional[SearchStats]: """Returns a SearchStats object.""" stats = self._job_statistics().get("searchStatistics") if stats is not None: return SearchStats.from_api_repr(stats) return None @property def query(self): """str: The query text used in this query job. See: https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationQuery.FIELDS.query """ return _helpers._get_sub_prop( self._properties, ["configuration", "query", "query"] ) @property def query_id(self) -> Optional[str]: """[Preview] ID of a completed query. This ID is auto-generated and not guaranteed to be populated. """ query_results = self._query_results return query_results.query_id if query_results is not None else None @property def query_parameters(self): """See :attr:`google.cloud.bigquery.job.QueryJobConfig.query_parameters`. """ return self.configuration.query_parameters @property def udf_resources(self): """See :attr:`google.cloud.bigquery.job.QueryJobConfig.udf_resources`. """ return self.configuration.udf_resources @property def use_legacy_sql(self): """See :attr:`google.cloud.bigquery.job.QueryJobConfig.use_legacy_sql`. """ return self.configuration.use_legacy_sql @property def use_query_cache(self): """See :attr:`google.cloud.bigquery.job.QueryJobConfig.use_query_cache`. """ return self.configuration.use_query_cache @property def write_disposition(self): """See :attr:`google.cloud.bigquery.job.QueryJobConfig.write_disposition`. """ return self.configuration.write_disposition @property def maximum_billing_tier(self): """See :attr:`google.cloud.bigquery.job.QueryJobConfig.maximum_billing_tier`. """ return self.configuration.maximum_billing_tier @property def maximum_bytes_billed(self): """See :attr:`google.cloud.bigquery.job.QueryJobConfig.maximum_bytes_billed`. """ return self.configuration.maximum_bytes_billed @property def range_partitioning(self): """See :attr:`google.cloud.bigquery.job.QueryJobConfig.range_partitioning`. """ return self.configuration.range_partitioning @property def table_definitions(self): """See :attr:`google.cloud.bigquery.job.QueryJobConfig.table_definitions`. """ return self.configuration.table_definitions @property def time_partitioning(self): """See :attr:`google.cloud.bigquery.job.QueryJobConfig.time_partitioning`. """ return self.configuration.time_partitioning @property def clustering_fields(self): """See :attr:`google.cloud.bigquery.job.QueryJobConfig.clustering_fields`. """ return self.configuration.clustering_fields @property def schema_update_options(self): """See :attr:`google.cloud.bigquery.job.QueryJobConfig.schema_update_options`. """ return self.configuration.schema_update_options def to_api_repr(self): """Generate a resource for :meth:`_begin`.""" # Use to_api_repr to allow for some configuration properties to be set # automatically. configuration = self.configuration.to_api_repr() return { "jobReference": self._properties["jobReference"], "configuration": configuration, } @classmethod def from_api_repr(cls, resource: dict, client: "Client") -> "QueryJob": """Factory: construct a job given its API representation Args: resource (Dict): dataset job representation returned from the API client (google.cloud.bigquery.client.Client): Client which holds credentials and project configuration for the dataset. Returns: google.cloud.bigquery.job.QueryJob: Job parsed from ``resource``. 
""" job_ref_properties = resource.setdefault( "jobReference", {"projectId": client.project, "jobId": None} ) job_ref = _JobReference._from_api_repr(job_ref_properties) job = cls(job_ref, None, client=client) job._set_properties(resource) return job @property def query_plan(self): """Return query plan from job statistics, if present. See: https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobStatistics2.FIELDS.query_plan Returns: List[google.cloud.bigquery.job.QueryPlanEntry]: mappings describing the query plan, or an empty list if the query has not yet completed. """ plan_entries = self._job_statistics().get("queryPlan", ()) return [QueryPlanEntry.from_api_repr(entry) for entry in plan_entries] @property def schema(self) -> Optional[List[SchemaField]]: """The schema of the results. Present only for successful dry run of non-legacy SQL queries. """ resource = self._job_statistics().get("schema") if resource is None: return None fields = resource.get("fields", []) return [SchemaField.from_api_repr(field) for field in fields] @property def timeline(self): """List(TimelineEntry): Return the query execution timeline from job statistics. """ raw = self._job_statistics().get("timeline", ()) return [TimelineEntry.from_api_repr(entry) for entry in raw] @property def total_bytes_processed(self): """Return total bytes processed from job statistics, if present. See: https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobStatistics2.FIELDS.total_bytes_processed Returns: Optional[int]: Total bytes processed by the job, or None if job is not yet complete. """ result = self._job_statistics().get("totalBytesProcessed") if result is not None: result = int(result) return result @property def total_bytes_billed(self): """Return total bytes billed from job statistics, if present. See: https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobStatistics2.FIELDS.total_bytes_billed Returns: Optional[int]: Total bytes processed by the job, or None if job is not yet complete. """ result = self._job_statistics().get("totalBytesBilled") if result is not None: result = int(result) return result @property def billing_tier(self): """Return billing tier from job statistics, if present. See: https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobStatistics2.FIELDS.billing_tier Returns: Optional[int]: Billing tier used by the job, or None if job is not yet complete. """ return self._job_statistics().get("billingTier") @property def cache_hit(self): """Return whether or not query results were served from cache. See: https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobStatistics2.FIELDS.cache_hit Returns: Optional[bool]: whether the query results were returned from cache, or None if job is not yet complete. """ return self._job_statistics().get("cacheHit") @property def ddl_operation_performed(self): """Optional[str]: Return the DDL operation performed. See: https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobStatistics2.FIELDS.ddl_operation_performed """ return self._job_statistics().get("ddlOperationPerformed") @property def ddl_target_routine(self): """Optional[google.cloud.bigquery.routine.RoutineReference]: Return the DDL target routine, present for CREATE/DROP FUNCTION/PROCEDURE queries. 
See: https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobStatistics2.FIELDS.ddl_target_routine """ prop = self._job_statistics().get("ddlTargetRoutine") if prop is not None: prop = RoutineReference.from_api_repr(prop) return prop @property def ddl_target_table(self): """Optional[google.cloud.bigquery.table.TableReference]: Return the DDL target table, present for CREATE/DROP TABLE/VIEW queries. See: https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobStatistics2.FIELDS.ddl_target_table """ prop = self._job_statistics().get("ddlTargetTable") if prop is not None: prop = TableReference.from_api_repr(prop) return prop @property def num_dml_affected_rows(self) -> Optional[int]: """Return the number of DML rows affected by the job. See: https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobStatistics2.FIELDS.num_dml_affected_rows Returns: Optional[int]: number of DML rows affected by the job, or None if job is not yet complete. """ result = self._job_statistics().get("numDmlAffectedRows") if result is not None: result = int(result) return result @property def slot_millis(self): """Union[int, None]: Slot-milliseconds used by this query job.""" return _helpers._int_or_none(self._job_statistics().get("totalSlotMs")) @property def statement_type(self): """Return statement type from job statistics, if present. See: https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobStatistics2.FIELDS.statement_type Returns: Optional[str]: type of statement used by the job, or None if job is not yet complete. """ return self._job_statistics().get("statementType") @property def referenced_tables(self): """Return referenced tables from job statistics, if present. See: https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobStatistics2.FIELDS.referenced_tables Returns: List[Dict]: mappings describing the query plan, or an empty list if the query has not yet completed. """ tables = [] datasets_by_project_name = {} for table in self._job_statistics().get("referencedTables", ()): t_project = table["projectId"] ds_id = table["datasetId"] t_dataset = datasets_by_project_name.get((t_project, ds_id)) if t_dataset is None: t_dataset = DatasetReference(t_project, ds_id) datasets_by_project_name[(t_project, ds_id)] = t_dataset t_name = table["tableId"] tables.append(t_dataset.table(t_name)) return tables @property def undeclared_query_parameters(self): """Return undeclared query parameters from job statistics, if present. See: https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobStatistics2.FIELDS.undeclared_query_parameters Returns: List[Union[ \ google.cloud.bigquery.query.ArrayQueryParameter, \ google.cloud.bigquery.query.ScalarQueryParameter, \ google.cloud.bigquery.query.StructQueryParameter \ ]]: Undeclared parameters, or an empty list if the query has not yet completed. """ parameters = [] undeclared = self._job_statistics().get("undeclaredQueryParameters", ()) for parameter in undeclared: p_type = parameter["parameterType"] if "arrayType" in p_type: klass = ArrayQueryParameter elif "structTypes" in p_type: klass = StructQueryParameter else: klass = ScalarQueryParameter parameters.append(klass.from_api_repr(parameter)) return parameters @property def estimated_bytes_processed(self): """Return the estimated number of bytes processed by the query. See: https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobStatistics2.FIELDS.estimated_bytes_processed Returns: Optional[int]: number of DML rows affected by the job, or None if job is not yet complete. 
""" result = self._job_statistics().get("estimatedBytesProcessed") if result is not None: result = int(result) return result @property def dml_stats(self) -> Optional[DmlStats]: stats = self._job_statistics().get("dmlStats") if stats is None: return None else: return DmlStats.from_api_repr(stats) @property def bi_engine_stats(self) -> Optional[BiEngineStats]: stats = self._job_statistics().get("biEngineStatistics") if stats is None: return None else: return BiEngineStats.from_api_repr(stats) def _blocking_poll(self, timeout=None, **kwargs): self._done_timeout = timeout self._transport_timeout = timeout super(QueryJob, self)._blocking_poll(timeout=timeout, **kwargs) @staticmethod def _format_for_exception(message: str, query: str): """Format a query for the output in exception message. Args: message (str): The original exception message. query (str): The SQL query to format. Returns: str: A formatted query text. """ template = "{message}\n\n{header}\n\n{ruler}\n{body}\n{ruler}" lines = query.splitlines() if query is not None else [""] max_line_len = max(len(line) for line in lines) header = "-----Query Job SQL Follows-----" header = "{:^{total_width}}".format(header, total_width=max_line_len + 5) # Print out a "ruler" above and below the SQL so we can judge columns. # Left pad for the line numbers (4 digits plus ":"). ruler = " |" + " . |" * (max_line_len // 10) # Put line numbers next to the SQL. body = "\n".join( "{:4}:{}".format(n, line) for n, line in enumerate(lines, start=1) ) return template.format(message=message, header=header, ruler=ruler, body=body) def _begin(self, client=None, retry=DEFAULT_RETRY, timeout=None): """API call: begin the job via a POST request See https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/insert Args: client (Optional[google.cloud.bigquery.client.Client]): The client to use. If not passed, falls back to the ``client`` associated with the job object or``NoneType``. retry (Optional[google.api_core.retry.Retry]): How to retry the RPC. timeout (Optional[float]): The number of seconds to wait for the underlying HTTP transport before using ``retry``. Raises: ValueError: If the job has already begun. """ try: super(QueryJob, self)._begin(client=client, retry=retry, timeout=timeout) except exceptions.GoogleAPICallError as exc: exc.message = _EXCEPTION_FOOTER_TEMPLATE.format( message=exc.message, location=self.location, job_id=self.job_id ) exc.debug_message = self._format_for_exception(exc.message, self.query) exc.query_job = self raise def _reload_query_results( self, retry: "retries.Retry" = DEFAULT_RETRY, timeout: Optional[float] = None, page_size: int = 0, ): """Refresh the cached query results unless already cached and complete. Args: retry (Optional[google.api_core.retry.Retry]): How to retry the call that retrieves query results. timeout (Optional[float]): The number of seconds to wait for the underlying HTTP transport before using ``retry``. page_size (int): Maximum number of rows in a single response. See maxResults in the jobs.getQueryResults REST API. """ # Optimization: avoid a call to jobs.getQueryResults if it's already # been fetched, e.g. from jobs.query first page of results. if self._query_results and self._query_results.complete: return # Since the API to getQueryResults can hang up to the timeout value # (default of 10 seconds), set the timeout parameter to ensure that # the timeout from the futures API is respected. 
See: # https://github.com/GoogleCloudPlatform/google-cloud-python/issues/4135 timeout_ms = None # Python_API_core, as part of a major rewrite of the deadline, timeout, # retry process sets the timeout value as a Python object(). # Our system does not natively handle that and instead expects # either None or a numeric value. If passed a Python object, convert to # None. if type(self._done_timeout) is object: # pragma: NO COVER self._done_timeout = None if self._done_timeout is not None: # pragma: NO COVER # Subtract a buffer for context switching, network latency, etc. api_timeout = self._done_timeout - _TIMEOUT_BUFFER_SECS api_timeout = max(min(api_timeout, 10), 0) self._done_timeout -= api_timeout self._done_timeout = max(0, self._done_timeout) timeout_ms = int(api_timeout * 1000) # If an explicit timeout is not given, fall back to the transport timeout # stored in _blocking_poll() in the process of polling for job completion. if timeout is not None: transport_timeout = timeout else: transport_timeout = self._transport_timeout # Handle PollingJob._DEFAULT_VALUE. if not isinstance(transport_timeout, (float, int)): transport_timeout = None self._query_results = self._client._get_query_results( self.job_id, retry, project=self.project, timeout_ms=timeout_ms, location=self.location, timeout=transport_timeout, page_size=page_size, ) def result( # type: ignore # (incompatible with supertype) self, page_size: Optional[int] = None, max_results: Optional[int] = None, retry: Optional[retries.Retry] = DEFAULT_RETRY, timeout: Optional[Union[float, object]] = POLLING_DEFAULT_VALUE, start_index: Optional[int] = None, job_retry: Optional[retries.Retry] = DEFAULT_JOB_RETRY, ) -> Union["RowIterator", _EmptyRowIterator]: """Start the job and wait for it to complete and get the result. Args: page_size (Optional[int]): The maximum number of rows in each page of results from this request. Non-positive values are ignored. max_results (Optional[int]): The maximum total number of rows from this request. retry (Optional[google.api_core.retry.Retry]): How to retry the call that retrieves rows. This only applies to making RPC calls. It isn't used to retry failed jobs. This has a reasonable default that should only be overridden with care. If the job state is ``DONE``, retrying is aborted early even if the results are not available, as this will not change anymore. timeout (Optional[Union[float, \ google.api_core.future.polling.PollingFuture._DEFAULT_VALUE, \ ]]): The number of seconds to wait for the underlying HTTP transport before using ``retry``. If ``None``, wait indefinitely unless an error is returned. If unset, only the underlying API calls have their default timeouts, but we still wait indefinitely for the job to finish. start_index (Optional[int]): The zero-based index of the starting row to read. job_retry (Optional[google.api_core.retry.Retry]): How to retry failed jobs. The default retries rate-limit-exceeded errors. Passing ``None`` disables job retry. Not all jobs can be retried. If ``job_id`` was provided to the query that created this job, then the job returned by the query will not be retryable, and an exception will be raised if non-``None`` non-default ``job_retry`` is also provided. Returns: google.cloud.bigquery.table.RowIterator: Iterator of row data :class:`~google.cloud.bigquery.table.Row`-s. 
During each page, the iterator will have the ``total_rows`` attribute set, which counts the total number of rows **in the result set** (this is distinct from the total number of rows in the current page: ``iterator.page.num_items``). If the query is a special query that produces no results, e.g. a DDL query, an ``_EmptyRowIterator`` instance is returned. Raises: google.cloud.exceptions.GoogleAPICallError: If the job failed and retries aren't successful. concurrent.futures.TimeoutError: If the job did not complete in the given timeout. TypeError: If Non-``None`` and non-default ``job_retry`` is provided and the job is not retryable. """ # Note: Since waiting for a query job to finish is more complex than # refreshing the job state in a loop, we avoid calling the superclass # in this method. if self.dry_run: return _EmptyRowIterator( project=self.project, location=self.location, # Intentionally omit job_id and query_id since this doesn't # actually correspond to a finished query job. ) # Setting max_results should be equivalent to setting page_size with # regards to allowing the user to tune how many results to download # while we wait for the query to finish. See internal issue: # 344008814. But if start_index is set, user is trying to access a # specific page, so we don't need to set page_size. See issue #1950. if page_size is None and max_results is not None and start_index is None: page_size = max_results # When timeout has default sentinel value ``object()``, do not pass # anything to invoke default timeouts in subsequent calls. done_kwargs: Dict[str, Union[_helpers.TimeoutType, object]] = {} reload_query_results_kwargs: Dict[str, Union[_helpers.TimeoutType, object]] = {} list_rows_kwargs: Dict[str, Union[_helpers.TimeoutType, object]] = {} if type(timeout) is not object: done_kwargs["timeout"] = timeout list_rows_kwargs["timeout"] = timeout reload_query_results_kwargs["timeout"] = timeout if page_size is not None: reload_query_results_kwargs["page_size"] = page_size try: retry_do_query = getattr(self, "_retry_do_query", None) if retry_do_query is not None: if job_retry is DEFAULT_JOB_RETRY: job_retry = self._job_retry # type: ignore else: if job_retry is not None and job_retry is not DEFAULT_JOB_RETRY: raise TypeError( "`job_retry` was provided, but this job is" " not retryable, because a custom `job_id` was" " provided to the query that created this job." ) restart_query_job = False def is_job_done(): nonlocal restart_query_job if restart_query_job: restart_query_job = False # The original job has failed. Create a new one. # # Note that we won't get here if retry_do_query is # None, because we won't use a retry. job = retry_do_query() # Become the new job: self.__dict__.clear() self.__dict__.update(job.__dict__) # It's possible the job fails again and we'll have to # retry that too. self._retry_do_query = retry_do_query self._job_retry = job_retry # If the job hasn't been created, create it now. Related: # https://github.com/googleapis/python-bigquery/issues/1940 if self.state is None: self._begin(retry=retry, **done_kwargs) # Refresh the job status with jobs.get because some of the # exceptions thrown by jobs.getQueryResults like timeout and # rateLimitExceeded errors are ambiguous. We want to know if # the query job failed and not just the call to # jobs.getQueryResults. if self.done(retry=retry, **done_kwargs): # If it's already failed, we might as well stop. 
job_failed_exception = self.exception() if job_failed_exception is not None: # Only try to restart the query job if the job failed for # a retriable reason. For example, don't restart the query # if the call to reload the job metadata within self.done() # timed out. # # The `restart_query_job` must only be called after a # successful call to the `jobs.get` REST API and we # determine that the job has failed. # # The `jobs.get` REST API # (https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/get) # is called via `self.done()` which calls # `self.reload()`. # # To determine if the job failed, the `self.exception()` # is set from `self.reload()` via # `self._set_properties()`, which translates the # `Job.status.errorResult` field # (https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobStatus.FIELDS.error_result) # into an exception that can be processed by the # `job_retry` predicate. restart_query_job = True raise job_failed_exception else: # Make sure that the _query_results are cached so we # can return a complete RowIterator. # # Note: As an optimization, _reload_query_results # doesn't make any API calls if the query results are # already cached and have jobComplete=True in the # response from the REST API. This ensures we aren't # making any extra API calls if the previous loop # iteration fetched the finished job. self._reload_query_results( retry=retry, **reload_query_results_kwargs ) return True # Call jobs.getQueryResults with max results set to 0 just to # wait for the query to finish. Unlike most methods, # jobs.getQueryResults hangs as long as it can to ensure we # know when the query has finished as soon as possible. self._reload_query_results(retry=retry, **reload_query_results_kwargs) # Even if the query is finished now according to # jobs.getQueryResults, we'll want to reload the job status if # it's not already DONE. return False if retry_do_query is not None and job_retry is not None: is_job_done = job_retry(is_job_done) # timeout can be a number of seconds, `None`, or a # `google.api_core.future.polling.PollingFuture._DEFAULT_VALUE` # sentinel object indicating a default timeout if we choose to add # one some day. This value can come from our PollingFuture # superclass and was introduced in # https://github.com/googleapis/python-api-core/pull/462. if isinstance(timeout, (float, int)): remaining_timeout = timeout else: # Note: we may need to handle _DEFAULT_VALUE as a separate # case someday, but even then the best we can do for queries # is 72+ hours for hyperparameter tuning jobs: # https://cloud.google.com/bigquery/quotas#query_jobs # # The timeout for a multi-statement query is 24+ hours. See: # https://cloud.google.com/bigquery/quotas#multi_statement_query_limits remaining_timeout = None if remaining_timeout is None: # Since is_job_done() calls jobs.getQueryResults, which is a # long-running API, don't delay the next request at all. while not is_job_done(): pass else: # Use a monotonic clock since we don't actually care about # daylight savings or similar, just the elapsed time. 
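                # Each pass below subtracts the time spent in the previous
                # is_job_done() call from remaining_timeout and raises
                # TimeoutError once that budget is exhausted.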
previous_time = time.monotonic() while not is_job_done(): current_time = time.monotonic() elapsed_time = current_time - previous_time remaining_timeout = remaining_timeout - elapsed_time previous_time = current_time if remaining_timeout < 0: raise concurrent.futures.TimeoutError() except exceptions.GoogleAPICallError as exc: exc.message = _EXCEPTION_FOOTER_TEMPLATE.format( message=exc.message, location=self.location, job_id=self.job_id ) exc.debug_message = self._format_for_exception(exc.message, self.query) # type: ignore exc.query_job = self # type: ignore raise except requests.exceptions.Timeout as exc: raise concurrent.futures.TimeoutError from exc # If the query job is complete but there are no query results, this was # special job, such as a DDL query. Return an empty result set to # indicate success and avoid calling tabledata.list on a table which # can't be read (such as a view table). if self._query_results.total_rows is None: return _EmptyRowIterator( location=self.location, project=self.project, job_id=self.job_id, query_id=self.query_id, num_dml_affected_rows=self._query_results.num_dml_affected_rows, ) # We know that there's at least 1 row, so only treat the response from # jobs.getQueryResults / jobs.query as the first page of the # RowIterator response if there are any rows in it. This prevents us # from stopping the iteration early in the cases where we set # maxResults=0. In that case, we're missing rows and there's no next # page token. first_page_response = self._query_results._properties if "rows" not in first_page_response: first_page_response = None rows = self._client._list_rows_from_query_results( self.job_id, self.location, self.project, self._query_results.schema, total_rows=self._query_results.total_rows, destination=self.destination, page_size=page_size, max_results=max_results, start_index=start_index, retry=retry, query_id=self.query_id, first_page_response=first_page_response, num_dml_affected_rows=self._query_results.num_dml_affected_rows, query=self.query, total_bytes_processed=self.total_bytes_processed, **list_rows_kwargs, ) rows._preserve_order = _contains_order_by(self.query) return rows # If changing the signature of this method, make sure to apply the same # changes to table.RowIterator.to_arrow(), except for the max_results parameter # that should only exist here in the QueryJob method. def to_arrow( self, progress_bar_type: Optional[str] = None, bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"] = None, create_bqstorage_client: bool = True, max_results: Optional[int] = None, ) -> "pyarrow.Table": """[Beta] Create a class:`pyarrow.Table` by loading all pages of a table or query. Args: progress_bar_type (Optional[str]): If set, use the `tqdm `_ library to display a progress bar while the data downloads. Install the ``tqdm`` package to use this feature. Possible values of ``progress_bar_type`` include: ``None`` No progress bar. ``'tqdm'`` Use the :func:`tqdm.tqdm` function to print a progress bar to :data:`sys.stdout`. ``'tqdm_notebook'`` Use the :func:`tqdm.notebook.tqdm` function to display a progress bar as a Jupyter notebook widget. ``'tqdm_gui'`` Use the :func:`tqdm.tqdm_gui` function to display a progress bar as a graphical dialog box. bqstorage_client (Optional[google.cloud.bigquery_storage_v1.BigQueryReadClient]): A BigQuery Storage API client. If supplied, use the faster BigQuery Storage API to fetch rows from BigQuery. This API is a billable API. This method requires ``google-cloud-bigquery-storage`` library. 
Reading from a specific partition or snapshot is not currently supported by this method. create_bqstorage_client (Optional[bool]): If ``True`` (default), create a BigQuery Storage API client using the default API settings. The BigQuery Storage API is a faster way to fetch rows from BigQuery. See the ``bqstorage_client`` parameter for more information. This argument does nothing if ``bqstorage_client`` is supplied. .. versionadded:: 1.24.0 max_results (Optional[int]): Maximum number of rows to include in the result. No limit by default. .. versionadded:: 2.21.0 Returns: pyarrow.Table A :class:`pyarrow.Table` populated with row data and column headers from the query results. The column headers are derived from the destination table's schema. Raises: ValueError: If the :mod:`pyarrow` library cannot be imported. .. versionadded:: 1.17.0 """ query_result = wait_for_query(self, progress_bar_type, max_results=max_results) return query_result.to_arrow( progress_bar_type=progress_bar_type, bqstorage_client=bqstorage_client, create_bqstorage_client=create_bqstorage_client, ) # If changing the signature of this method, make sure to apply the same # changes to table.RowIterator.to_dataframe(), except for the max_results parameter # that should only exist here in the QueryJob method. def to_dataframe( self, bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"] = None, dtypes: Optional[Dict[str, Any]] = None, progress_bar_type: Optional[str] = None, create_bqstorage_client: bool = True, max_results: Optional[int] = None, geography_as_object: bool = False, bool_dtype: Union[Any, None] = DefaultPandasDTypes.BOOL_DTYPE, int_dtype: Union[Any, None] = DefaultPandasDTypes.INT_DTYPE, float_dtype: Union[Any, None] = None, string_dtype: Union[Any, None] = None, date_dtype: Union[Any, None] = DefaultPandasDTypes.DATE_DTYPE, datetime_dtype: Union[Any, None] = None, time_dtype: Union[Any, None] = DefaultPandasDTypes.TIME_DTYPE, timestamp_dtype: Union[Any, None] = None, range_date_dtype: Union[Any, None] = DefaultPandasDTypes.RANGE_DATE_DTYPE, range_datetime_dtype: Union[ Any, None ] = DefaultPandasDTypes.RANGE_DATETIME_DTYPE, range_timestamp_dtype: Union[ Any, None ] = DefaultPandasDTypes.RANGE_TIMESTAMP_DTYPE, ) -> "pandas.DataFrame": """Return a pandas DataFrame from a QueryJob Args: bqstorage_client (Optional[google.cloud.bigquery_storage_v1.BigQueryReadClient]): A BigQuery Storage API client. If supplied, use the faster BigQuery Storage API to fetch rows from BigQuery. This API is a billable API. This method requires the ``fastavro`` and ``google-cloud-bigquery-storage`` libraries. Reading from a specific partition or snapshot is not currently supported by this method. dtypes (Optional[Map[str, Union[str, pandas.Series.dtype]]]): A dictionary of column names pandas ``dtype``s. The provided ``dtype`` is used when constructing the series for the column specified. Otherwise, the default pandas behavior is used. progress_bar_type (Optional[str]): If set, use the `tqdm `_ library to display a progress bar while the data downloads. Install the ``tqdm`` package to use this feature. See :func:`~google.cloud.bigquery.table.RowIterator.to_dataframe` for details. .. versionadded:: 1.11.0 create_bqstorage_client (Optional[bool]): If ``True`` (default), create a BigQuery Storage API client using the default API settings. The BigQuery Storage API is a faster way to fetch rows from BigQuery. See the ``bqstorage_client`` parameter for more information. This argument does nothing if ``bqstorage_client`` is supplied. 
.. versionadded:: 1.24.0 max_results (Optional[int]): Maximum number of rows to include in the result. No limit by default. .. versionadded:: 2.21.0 geography_as_object (Optional[bool]): If ``True``, convert GEOGRAPHY data to :mod:`shapely` geometry objects. If ``False`` (default), don't cast geography data to :mod:`shapely` geometry objects. .. versionadded:: 2.24.0 bool_dtype (Optional[pandas.Series.dtype, None]): If set, indicate a pandas ExtensionDtype (e.g. ``pandas.BooleanDtype()``) to convert BigQuery Boolean type, instead of relying on the default ``pandas.BooleanDtype()``. If you explicitly set the value to ``None``, then the data type will be ``numpy.dtype("bool")``. BigQuery Boolean type can be found at: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#boolean_type .. versionadded:: 3.8.0 int_dtype (Optional[pandas.Series.dtype, None]): If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Int64Dtype()``) to convert BigQuery Integer types, instead of relying on the default ``pandas.Int64Dtype()``. If you explicitly set the value to ``None``, then the data type will be ``numpy.dtype("int64")``. A list of BigQuery Integer types can be found at: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#integer_types .. versionadded:: 3.8.0 float_dtype (Optional[pandas.Series.dtype, None]): If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Float32Dtype()``) to convert BigQuery Float type, instead of relying on the default ``numpy.dtype("float64")``. If you explicitly set the value to ``None``, then the data type will be ``numpy.dtype("float64")``. BigQuery Float type can be found at: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#floating_point_types .. versionadded:: 3.8.0 string_dtype (Optional[pandas.Series.dtype, None]): If set, indicate a pandas ExtensionDtype (e.g. ``pandas.StringDtype()``) to convert BigQuery String type, instead of relying on the default ``numpy.dtype("object")``. If you explicitly set the value to ``None``, then the data type will be ``numpy.dtype("object")``. BigQuery String type can be found at: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#string_type .. versionadded:: 3.8.0 date_dtype (Optional[pandas.Series.dtype, None]): If set, indicate a pandas ExtensionDtype (e.g. ``pandas.ArrowDtype(pyarrow.date32())``) to convert BigQuery Date type, instead of relying on the default ``db_dtypes.DateDtype()``. If you explicitly set the value to ``None``, then the data type will be ``numpy.dtype("datetime64[ns]")`` or ``object`` if out of bound. BigQuery Date type can be found at: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#date_type .. versionadded:: 3.10.0 datetime_dtype (Optional[pandas.Series.dtype, None]): If set, indicate a pandas ExtensionDtype (e.g. ``pandas.ArrowDtype(pyarrow.timestamp("us"))``) to convert BigQuery Datetime type, instead of relying on the default ``numpy.dtype("datetime64[ns]")``. If you explicitly set the value to ``None``, then the data type will be ``numpy.dtype("datetime64[ns]")`` or ``object`` if out of bound. BigQuery Datetime type can be found at: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#datetime_type .. versionadded:: 3.10.0 time_dtype (Optional[pandas.Series.dtype, None]): If set, indicate a pandas ExtensionDtype (e.g. ``pandas.ArrowDtype(pyarrow.time64("us"))``) to convert BigQuery Time type, instead of relying on the default ``db_dtypes.TimeDtype()``.
If you explicitly set the value to ``None``, then the data type will be ``numpy.dtype("object")``. BigQuery Time type can be found at: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#time_type .. versionadded:: 3.10.0 timestamp_dtype (Optional[pandas.Series.dtype, None]): If set, indicate a pandas ExtensionDtype (e.g. ``pandas.ArrowDtype(pyarrow.timestamp("us", tz="UTC"))``) to convert BigQuery Timestamp type, instead of relying on the default ``numpy.dtype("datetime64[ns, UTC]")``. If you explicitly set the value to ``None``, then the data type will be ``numpy.dtype("datetime64[ns, UTC]")`` or ``object`` if out of bound. BigQuery Timestamp type can be found at: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#timestamp_type .. versionadded:: 3.10.0 range_date_dtype (Optional[pandas.Series.dtype, None]): If set, indicate a pandas ExtensionDtype, such as: .. code-block:: python pandas.ArrowDtype(pyarrow.struct( [("start", pyarrow.date32()), ("end", pyarrow.date32())] )) to convert BigQuery RANGE type, instead of relying on the default ``object``. If you explicitly set the value to ``None``, the data type will be ``object``. BigQuery Range type can be found at: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#range_type .. versionadded:: 3.21.0 range_datetime_dtype (Optional[pandas.Series.dtype, None]): If set, indicate a pandas ExtensionDtype, such as: .. code-block:: python pandas.ArrowDtype(pyarrow.struct( [ ("start", pyarrow.timestamp("us")), ("end", pyarrow.timestamp("us")), ] )) to convert BigQuery RANGE type, instead of relying on the default ``object``. If you explicitly set the value to ``None``, the data type will be ``object``. BigQuery Range type can be found at: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#range_type .. versionadded:: 3.21.0 range_timestamp_dtype (Optional[pandas.Series.dtype, None]): If set, indicate a pandas ExtensionDtype, such as: .. code-block:: python pandas.ArrowDtype(pyarrow.struct( [ ("start", pyarrow.timestamp("us", tz="UTC")), ("end", pyarrow.timestamp("us", tz="UTC")), ] )) to convert BigQuery RANGE type, instead of relying on the default ``object``. If you explicitly set the value to ``None``, the data type will be ``object``. BigQuery Range type can be found at: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#range_type .. versionadded:: 3.21.0 Returns: pandas.DataFrame: A :class:`~pandas.DataFrame` populated with row data and column headers from the query results. The column headers are derived from the destination table's schema. Raises: ValueError: If the :mod:`pandas` library cannot be imported, or the :mod:`google.cloud.bigquery_storage_v1` module is required but cannot be imported. Also if ``geography_as_object`` is ``True``, but the :mod:`shapely` library cannot be imported.
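Example: The snippet below is a minimal usage sketch of the dtype overrides described above; it assumes an already-constructed :class:`~google.cloud.bigquery.Client` named ``client``, and the project, dataset, and query text are illustrative placeholders.

.. code-block:: python

    import pandas

    # ``client`` is assumed to be an existing google.cloud.bigquery.Client.
    job = client.query(
        "SELECT name, age, is_active FROM `my-project.my_dataset.users`"
    )
    df = job.to_dataframe(
        max_results=1000,
        int_dtype=pandas.Int32Dtype(),  # override the default Int64Dtype
        bool_dtype=None,  # fall back to numpy.dtype("bool")
        create_bqstorage_client=False,  # fetch rows with the REST API only
    )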
""" query_result = wait_for_query(self, progress_bar_type, max_results=max_results) return query_result.to_dataframe( bqstorage_client=bqstorage_client, dtypes=dtypes, progress_bar_type=progress_bar_type, create_bqstorage_client=create_bqstorage_client, geography_as_object=geography_as_object, bool_dtype=bool_dtype, int_dtype=int_dtype, float_dtype=float_dtype, string_dtype=string_dtype, date_dtype=date_dtype, datetime_dtype=datetime_dtype, time_dtype=time_dtype, timestamp_dtype=timestamp_dtype, range_date_dtype=range_date_dtype, range_datetime_dtype=range_datetime_dtype, range_timestamp_dtype=range_timestamp_dtype, ) # If changing the signature of this method, make sure to apply the same # changes to table.RowIterator.to_dataframe(), except for the max_results parameter # that should only exist here in the QueryJob method. def to_geodataframe( self, bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"] = None, dtypes: Optional[Dict[str, Any]] = None, progress_bar_type: Optional[str] = None, create_bqstorage_client: bool = True, max_results: Optional[int] = None, geography_column: Optional[str] = None, ) -> "geopandas.GeoDataFrame": """Return a GeoPandas GeoDataFrame from a QueryJob Args: bqstorage_client (Optional[google.cloud.bigquery_storage_v1.BigQueryReadClient]): A BigQuery Storage API client. If supplied, use the faster BigQuery Storage API to fetch rows from BigQuery. This API is a billable API. This method requires the ``fastavro`` and ``google-cloud-bigquery-storage`` libraries. Reading from a specific partition or snapshot is not currently supported by this method. dtypes (Optional[Map[str, Union[str, pandas.Series.dtype]]]): A dictionary of column names pandas ``dtype``s. The provided ``dtype`` is used when constructing the series for the column specified. Otherwise, the default pandas behavior is used. progress_bar_type (Optional[str]): If set, use the `tqdm `_ library to display a progress bar while the data downloads. Install the ``tqdm`` package to use this feature. See :func:`~google.cloud.bigquery.table.RowIterator.to_dataframe` for details. .. versionadded:: 1.11.0 create_bqstorage_client (Optional[bool]): If ``True`` (default), create a BigQuery Storage API client using the default API settings. The BigQuery Storage API is a faster way to fetch rows from BigQuery. See the ``bqstorage_client`` parameter for more information. This argument does nothing if ``bqstorage_client`` is supplied. .. versionadded:: 1.24.0 max_results (Optional[int]): Maximum number of rows to include in the result. No limit by default. .. versionadded:: 2.21.0 geography_column (Optional[str]): If there are more than one GEOGRAPHY column, identifies which one to use to construct a GeoPandas GeoDataFrame. This option can be ommitted if there's only one GEOGRAPHY column. Returns: geopandas.GeoDataFrame: A :class:`geopandas.GeoDataFrame` populated with row data and column headers from the query results. The column headers are derived from the destination table's schema. Raises: ValueError: If the :mod:`geopandas` library cannot be imported, or the :mod:`google.cloud.bigquery_storage_v1` module is required but cannot be imported. .. 
versionadded:: 2.24.0 """ query_result = wait_for_query(self, progress_bar_type, max_results=max_results) return query_result.to_geodataframe( bqstorage_client=bqstorage_client, dtypes=dtypes, progress_bar_type=progress_bar_type, create_bqstorage_client=create_bqstorage_client, geography_column=geography_column, ) def __iter__(self): return iter(self.result()) class QueryPlanEntryStep(object): """Map a single step in a query plan entry. Args: kind (str): step type. substeps (List): names of substeps. """ def __init__(self, kind, substeps): self.kind = kind self.substeps = list(substeps) @classmethod def from_api_repr(cls, resource: dict) -> "QueryPlanEntryStep": """Factory: construct instance from the JSON repr. Args: resource (Dict): JSON representation of the entry. Returns: google.cloud.bigquery.job.QueryPlanEntryStep: New instance built from the resource. """ return cls(kind=resource.get("kind"), substeps=resource.get("substeps", ())) def __eq__(self, other): if not isinstance(other, self.__class__): return NotImplemented return self.kind == other.kind and self.substeps == other.substeps class QueryPlanEntry(object): """QueryPlanEntry represents a single stage of a query execution plan. See https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#ExplainQueryStage for the underlying API representation within query statistics. """ def __init__(self): self._properties = {} @classmethod def from_api_repr(cls, resource: dict) -> "QueryPlanEntry": """Factory: construct instance from the JSON repr. Args: resource(Dict[str: object]): ExplainQueryStage representation returned from API. Returns: google.cloud.bigquery.job.QueryPlanEntry: Query plan entry parsed from ``resource``. """ entry = cls() entry._properties = resource return entry @property def name(self): """Optional[str]: Human-readable name of the stage.""" return self._properties.get("name") @property def entry_id(self): """Optional[str]: Unique ID for the stage within the plan.""" return self._properties.get("id") @property def start(self): """Optional[Datetime]: Datetime when the stage started.""" if self._properties.get("startMs") is None: return None return _helpers._datetime_from_microseconds( int(self._properties.get("startMs")) * 1000.0 ) @property def end(self): """Optional[Datetime]: Datetime when the stage ended.""" if self._properties.get("endMs") is None: return None return _helpers._datetime_from_microseconds( int(self._properties.get("endMs")) * 1000.0 ) @property def input_stages(self): """List(int): Entry IDs for stages that were inputs for this stage.""" if self._properties.get("inputStages") is None: return [] return [ _helpers._int_or_none(entry) for entry in self._properties.get("inputStages") ] @property def parallel_inputs(self): """Optional[int]: Number of parallel input segments within the stage. """ return _helpers._int_or_none(self._properties.get("parallelInputs")) @property def completed_parallel_inputs(self): """Optional[int]: Number of parallel input segments completed.""" return _helpers._int_or_none(self._properties.get("completedParallelInputs")) @property def wait_ms_avg(self): """Optional[int]: Milliseconds the average worker spent waiting to be scheduled. """ return _helpers._int_or_none(self._properties.get("waitMsAvg")) @property def wait_ms_max(self): """Optional[int]: Milliseconds the slowest worker spent waiting to be scheduled. 
""" return _helpers._int_or_none(self._properties.get("waitMsMax")) @property def wait_ratio_avg(self): """Optional[float]: Ratio of time the average worker spent waiting to be scheduled, relative to the longest time spent by any worker in any stage of the overall plan. """ return self._properties.get("waitRatioAvg") @property def wait_ratio_max(self): """Optional[float]: Ratio of time the slowest worker spent waiting to be scheduled, relative to the longest time spent by any worker in any stage of the overall plan. """ return self._properties.get("waitRatioMax") @property def read_ms_avg(self): """Optional[int]: Milliseconds the average worker spent reading input. """ return _helpers._int_or_none(self._properties.get("readMsAvg")) @property def read_ms_max(self): """Optional[int]: Milliseconds the slowest worker spent reading input. """ return _helpers._int_or_none(self._properties.get("readMsMax")) @property def read_ratio_avg(self): """Optional[float]: Ratio of time the average worker spent reading input, relative to the longest time spent by any worker in any stage of the overall plan. """ return self._properties.get("readRatioAvg") @property def read_ratio_max(self): """Optional[float]: Ratio of time the slowest worker spent reading to be scheduled, relative to the longest time spent by any worker in any stage of the overall plan. """ return self._properties.get("readRatioMax") @property def compute_ms_avg(self): """Optional[int]: Milliseconds the average worker spent on CPU-bound processing. """ return _helpers._int_or_none(self._properties.get("computeMsAvg")) @property def compute_ms_max(self): """Optional[int]: Milliseconds the slowest worker spent on CPU-bound processing. """ return _helpers._int_or_none(self._properties.get("computeMsMax")) @property def compute_ratio_avg(self): """Optional[float]: Ratio of time the average worker spent on CPU-bound processing, relative to the longest time spent by any worker in any stage of the overall plan. """ return self._properties.get("computeRatioAvg") @property def compute_ratio_max(self): """Optional[float]: Ratio of time the slowest worker spent on CPU-bound processing, relative to the longest time spent by any worker in any stage of the overall plan. """ return self._properties.get("computeRatioMax") @property def write_ms_avg(self): """Optional[int]: Milliseconds the average worker spent writing output data. """ return _helpers._int_or_none(self._properties.get("writeMsAvg")) @property def write_ms_max(self): """Optional[int]: Milliseconds the slowest worker spent writing output data. """ return _helpers._int_or_none(self._properties.get("writeMsMax")) @property def write_ratio_avg(self): """Optional[float]: Ratio of time the average worker spent writing output data, relative to the longest time spent by any worker in any stage of the overall plan. """ return self._properties.get("writeRatioAvg") @property def write_ratio_max(self): """Optional[float]: Ratio of time the slowest worker spent writing output data, relative to the longest time spent by any worker in any stage of the overall plan. 
""" return self._properties.get("writeRatioMax") @property def records_read(self): """Optional[int]: Number of records read by this stage.""" return _helpers._int_or_none(self._properties.get("recordsRead")) @property def records_written(self): """Optional[int]: Number of records written by this stage.""" return _helpers._int_or_none(self._properties.get("recordsWritten")) @property def status(self): """Optional[str]: status of this stage.""" return self._properties.get("status") @property def shuffle_output_bytes(self): """Optional[int]: Number of bytes written by this stage to intermediate shuffle. """ return _helpers._int_or_none(self._properties.get("shuffleOutputBytes")) @property def shuffle_output_bytes_spilled(self): """Optional[int]: Number of bytes written by this stage to intermediate shuffle and spilled to disk. """ return _helpers._int_or_none(self._properties.get("shuffleOutputBytesSpilled")) @property def steps(self): """List(QueryPlanEntryStep): List of step operations performed by each worker in the stage. """ return [ QueryPlanEntryStep.from_api_repr(step) for step in self._properties.get("steps", []) ] @property def slot_ms(self): """Optional[int]: Slot-milliseconds used by the stage.""" return _helpers._int_or_none(self._properties.get("slotMs")) class TimelineEntry(object): """TimelineEntry represents progress of a query job at a particular point in time. See https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#querytimelinesample for the underlying API representation within query statistics. """ def __init__(self): self._properties = {} @classmethod def from_api_repr(cls, resource): """Factory: construct instance from the JSON repr. Args: resource(Dict[str: object]): QueryTimelineSample representation returned from API. Returns: google.cloud.bigquery.TimelineEntry: Timeline sample parsed from ``resource``. """ entry = cls() entry._properties = resource return entry @property def elapsed_ms(self): """Optional[int]: Milliseconds elapsed since start of query execution.""" return _helpers._int_or_none(self._properties.get("elapsedMs")) @property def active_units(self): """Optional[int]: Current number of input units being processed by workers, reported as largest value since the last sample.""" return _helpers._int_or_none(self._properties.get("activeUnits")) @property def pending_units(self): """Optional[int]: Current number of input units remaining for query stages active at this sample time.""" return _helpers._int_or_none(self._properties.get("pendingUnits")) @property def completed_units(self): """Optional[int]: Current number of input units completed by this query.""" return _helpers._int_or_none(self._properties.get("completedUnits")) @property def slot_millis(self): """Optional[int]: Cumulative slot-milliseconds consumed by this query.""" return _helpers._int_or_none(self._properties.get("totalSlotMs"))