structure saas with tools

Davidson Gomes
2025-04-25 15:30:54 -03:00
commit 1aef473937
16434 changed files with 6584257 additions and 0 deletions


@@ -0,0 +1,20 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from google.cloud.bigquery.magics.magics import context
# For backwards compatibility we need to make the context available in the path
# google.cloud.bigquery.magics.context
__all__ = ("context",)


@@ -0,0 +1,34 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from google.cloud.bigquery.magics.line_arg_parser.exceptions import ParseError
from google.cloud.bigquery.magics.line_arg_parser.exceptions import (
DuplicateQueryParamsError,
QueryParamsParseError,
)
from google.cloud.bigquery.magics.line_arg_parser.lexer import Lexer
from google.cloud.bigquery.magics.line_arg_parser.lexer import TokenType
from google.cloud.bigquery.magics.line_arg_parser.parser import Parser
from google.cloud.bigquery.magics.line_arg_parser.visitors import QueryParamsExtractor
__all__ = (
"DuplicateQueryParamsError",
"Lexer",
"Parser",
"ParseError",
"QueryParamsExtractor",
"QueryParamsParseError",
"TokenType",
)


@@ -0,0 +1,25 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
class ParseError(Exception):
"""Base class for errors raised while parsing the cell magic input line."""
class QueryParamsParseError(ParseError):
"""Raised when --params option is syntactically incorrect."""
class DuplicateQueryParamsError(ParseError):
"""Raised when the --params option is specified more than once."""
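A minimal usage sketch for the hierarchy above (illustrative): catching the base ``ParseError`` handles both subclasses.

    >>> try:
    ...     raise DuplicateQueryParamsError("Duplicate --params option")
    ... except ParseError as exc:
    ...     print(type(exc).__name__)
    DuplicateQueryParamsError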


@@ -0,0 +1,200 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import namedtuple
from collections import OrderedDict
import itertools
import re
import enum
Token = namedtuple("Token", ("type_", "lexeme", "pos"))
StateTransition = namedtuple("StateTransition", ("new_state", "total_offset"))
# Pattern matching is done with regexes, and the order in which the token patterns are
# defined is important.
#
# Suppose we had the following token definitions:
# * INT - a token matching integers,
# * FLOAT - a token matching floating point numbers,
# * DOT - a token matching a single literal dot character, i.e. "."
#
# The FLOAT token would have to be defined first, since we would want the input "1.23"
# to be tokenized as a single FLOAT token, and *not* three tokens (INT, DOT, INT).
#
# Sometimes, however, different tokens match too similar patterns, and it is not
# possible to define them in order that would avoid any ambiguity. One such case are
# the OPT_VAL and PY_NUMBER tokens, as both can match an integer literal, say "42".
#
# In order to avoid the dilemmas, the lexer implements a concept of STATES. States are
# used to split token definitions into subgroups, and in each lexer state only a single
# subgroup is used for tokenizing the input. Lexer states can therefore be thought of as
# token namespaces.
#
# For example, while parsing the value of the "--params" option, we do not want to
# "recognize" it as a single OPT_VAL token, but instead want to parse it as a Python
# dictionary and verify its syntactical correctness. On the other hand, while parsing
# the value of an option other than "--params", we do not really care about its
# structure, and thus do not want to use any of the "Python tokens" for pattern matching.
#
# Token definition order is important, thus an OrderedDict is used. In addition, PEP 468
# guarantees us that the order of kwargs is preserved in Python 3.6+.
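# A minimal illustration of the ordering rule described above, using plain
# ``re`` (an editorial sketch, not part of the token definitions below):
#
#   >>> import re
#   >>> re.match(r"(?P<INT>\d+)|(?P<FLOAT>\d+\.\d+)", "1.23").lastgroup
#   'INT'
#   >>> re.match(r"(?P<FLOAT>\d+\.\d+)|(?P<INT>\d+)", "1.23").lastgroup
#   'FLOAT'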
token_types = OrderedDict(
state_parse_pos_args=OrderedDict(
GOTO_PARSE_NON_PARAMS_OPTIONS=r"(?P<GOTO_PARSE_NON_PARAMS_OPTIONS>(?=--))", # double dash - starting the options list
DEST_VAR=r"(?P<DEST_VAR>[^\d\W]\w*)", # essentially a Python ID
),
state_parse_non_params_options=OrderedDict(
GOTO_PARSE_PARAMS_OPTION=r"(?P<GOTO_PARSE_PARAMS_OPTION>(?=--params(?:\s|=|--|$)))", # the --params option
OPTION_SPEC=r"(?P<OPTION_SPEC>--\w+)",
OPTION_EQ=r"(?P<OPTION_EQ>=)",
OPT_VAL=r"(?P<OPT_VAL>\S+?(?=\s|--|$))",
),
state_parse_params_option=OrderedDict(
PY_STRING=r"(?P<PY_STRING>(?:{})|(?:{}))".format( # single and double quoted strings
r"'(?:[^'\\]|\.)*'", r'"(?:[^"\\]|\.)*"'
),
PARAMS_OPT_SPEC=r"(?P<PARAMS_OPT_SPEC>--params(?=\s|=|--|$))",
PARAMS_OPT_EQ=r"(?P<PARAMS_OPT_EQ>=)",
GOTO_PARSE_NON_PARAMS_OPTIONS=r"(?P<GOTO_PARSE_NON_PARAMS_OPTIONS>(?=--\w+))", # found another option spec
PY_BOOL=r"(?P<PY_BOOL>True|False)",
DOLLAR_PY_ID=r"(?P<DOLLAR_PY_ID>\$[^\d\W]\w*)",
PY_NUMBER=r"(?P<PY_NUMBER>-?[1-9]\d*(?:\.\d+)?(:?[e|E][+-]?\d+)?)",
SQUOTE=r"(?P<SQUOTE>')",
DQUOTE=r'(?P<DQUOTE>")',
COLON=r"(?P<COLON>:)",
COMMA=r"(?P<COMMA>,)",
LCURL=r"(?P<LCURL>\{)",
RCURL=r"(?P<RCURL>})",
LSQUARE=r"(?P<LSQUARE>\[)",
RSQUARE=r"(?P<RSQUARE>])",
LPAREN=r"(?P<LPAREN>\()",
RPAREN=r"(?P<RPAREN>\))",
),
common=OrderedDict(
WS=r"(?P<WS>\s+)",
EOL=r"(?P<EOL>$)",
UNKNOWN=r"(?P<UNKNOWN>\S+)", # anything not a whitespace or matched by something else
),
)
class AutoStrEnum(str, enum.Enum):
"""Base enum class for for name=value str enums."""
def _generate_next_value_(name, start, count, last_values):
return name
TokenType = AutoStrEnum( # type: ignore # pytype: disable=wrong-arg-types
"TokenType",
[
(name, enum.auto())
for name in itertools.chain.from_iterable(token_types.values())
if not name.startswith("GOTO_")
],
)
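# Since AutoStrEnum subclasses ``str`` and uses each member's name as its
# value, token types compare equal to plain strings (illustrative):
#
#   >>> TokenType.DEST_VAR == "DEST_VAR"
#   True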
class LexerState(AutoStrEnum):
PARSE_POS_ARGS = enum.auto() # parsing positional arguments
PARSE_NON_PARAMS_OPTIONS = enum.auto() # parsing options other than "--params"
PARSE_PARAMS_OPTION = enum.auto() # parsing the "--params" option
STATE_END = enum.auto()
class Lexer(object):
"""Lexical analyzer for tokenizing the cell magic input line."""
_GRAND_PATTERNS = {
LexerState.PARSE_POS_ARGS: re.compile(
"|".join(
itertools.chain(
token_types["state_parse_pos_args"].values(),
token_types["common"].values(),
)
)
),
LexerState.PARSE_NON_PARAMS_OPTIONS: re.compile(
"|".join(
itertools.chain(
token_types["state_parse_non_params_options"].values(),
token_types["common"].values(),
)
)
),
LexerState.PARSE_PARAMS_OPTION: re.compile(
"|".join(
itertools.chain(
token_types["state_parse_params_option"].values(),
token_types["common"].values(),
)
)
),
}
def __init__(self, input_text):
self._text = input_text
def __iter__(self):
# Since re.scanner does not seem to support manipulating inner scanner states,
# we need to implement lexer state transitions manually using special
# non-capturing lookahead token patterns to signal when a state transition
# should be made.
# Since we don't have "nested" states, we don't really need a stack and
# this simple mechanism is sufficient.
state = LexerState.PARSE_POS_ARGS
offset = 0 # the number of characters processed so far
while state != LexerState.STATE_END:
token_stream = self._find_state_tokens(state, offset)
for maybe_token in token_stream: # pragma: NO COVER
if isinstance(maybe_token, StateTransition):
state = maybe_token.new_state
offset = maybe_token.total_offset
break
if maybe_token.type_ != TokenType.WS:
yield maybe_token
if maybe_token.type_ == TokenType.EOL:
state = LexerState.STATE_END
break
def _find_state_tokens(self, state, current_offset):
"""Scan the input for current state's tokens starting at ``current_offset``.
Args:
state (LexerState): The current lexer state.
current_offset (int): The offset in the input text, i.e. the number
of characters already scanned so far.
Yields:
The next ``Token`` or ``StateTransition`` instance.
"""
pattern = self._GRAND_PATTERNS[state]
scanner = pattern.finditer(self._text, current_offset)
for match in scanner: # pragma: NO COVER
token_type = match.lastgroup
if token_type.startswith("GOTO_"):
yield StateTransition(
new_state=getattr(LexerState, token_type[5:]), # w/o "GOTO_" prefix
total_offset=match.start(),
)
yield Token(token_type, match.group(), match.start())
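A usage sketch for the lexer above (illustrative): whitespace tokens are skipped, and the ``GOTO_*`` lookahead patterns never surface as tokens.

    >>> from google.cloud.bigquery.magics.line_arg_parser.lexer import Lexer
    >>> for token in Lexer("df --params {'num': 17} --max_results 10"):
    ...     print(token.type_, repr(token.lexeme))

This should print, in order: DEST_VAR, PARAMS_OPT_SPEC, LCURL, PY_STRING, COLON, PY_NUMBER, RCURL, OPTION_SPEC, OPT_VAL, and finally EOL.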


@@ -0,0 +1,484 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from google.cloud.bigquery.magics.line_arg_parser import DuplicateQueryParamsError
from google.cloud.bigquery.magics.line_arg_parser import ParseError
from google.cloud.bigquery.magics.line_arg_parser import QueryParamsParseError
from google.cloud.bigquery.magics.line_arg_parser import TokenType
class ParseNode(object):
"""A base class for nodes in the input parsed to an abstract syntax tree."""
class InputLine(ParseNode):
def __init__(self, destination_var, option_list):
self.destination_var = destination_var
self.option_list = option_list
class DestinationVar(ParseNode):
def __init__(self, token):
# token type is DEST_VAR
self.token = token
self.name = token.lexeme if token is not None else None
class CmdOptionList(ParseNode):
def __init__(self, option_nodes):
self.options = [node for node in option_nodes] # shallow copy
class CmdOption(ParseNode):
def __init__(self, name, value):
self.name = name # string
self.value = value # CmdOptionValue node
class ParamsOption(CmdOption):
def __init__(self, value):
super(ParamsOption, self).__init__("params", value)
class CmdOptionValue(ParseNode):
def __init__(self, token):
# token type is OPT_VAL
self.token = token
self.value = token.lexeme
class PyVarExpansion(ParseNode):
def __init__(self, token):
self.token = token
self.raw_value = token.lexeme
class PyDict(ParseNode):
def __init__(self, dict_items):
self.items = [item for item in dict_items] # shallow copy
class PyDictItem(ParseNode):
def __init__(self, key, value):
self.key = key
self.value = value
class PyDictKey(ParseNode):
def __init__(self, token):
self.token = token
self.key_value = token.lexeme
class PyScalarValue(ParseNode):
def __init__(self, token, raw_value):
self.token = token
self.raw_value = raw_value
class PyTuple(ParseNode):
def __init__(self, tuple_items):
self.items = [item for item in tuple_items] # shallow copy
class PyList(ParseNode):
def __init__(self, list_items):
self.items = [item for item in list_items] # shallow copy
class Parser(object):
"""Parser for the tokenized cell magic input line.
The parser recognizes a simplified subset of the Python grammar, specifically
the dictionary representation typically passed to the "--params" option of
the %%bigquery cell magic.
The grammar (terminal symbols are CAPITALIZED):
input_line : destination_var option_list
destination_var : DEST_VAR | EMPTY
option_list : (OPTION_SPEC [OPTION_EQ] option_value)*
(params_option | EMPTY)
(OPTION_SPEC [OPTION_EQ] option_value)*
option_value : OPT_VAL | EMPTY
# DOLLAR_PY_ID can occur if a variable passed to --params does not exist
# and is thus not expanded to a dict.
params_option : PARAMS_OPT_SPEC [PARAMS_OPT_EQ] \
(DOLLAR_PY_ID | PY_STRING | py_dict)
py_dict : LCURL dict_items RCURL
dict_items : dict_item | (dict_item COMMA dict_items)
dict_item : (dict_key COLON py_value) | EMPTY
# dict keys are actually @parameter names in the cell body (i.e. the query),
# hence the restriction to strings.
dict_key : PY_STRING
py_value : PY_BOOL
| PY_NUMBER
| PY_STRING
| py_tuple
| py_list
| py_dict
py_tuple : LPAREN collection_items RPAREN
py_list : LSQUARE collection_items RSQUARE
collection_items : collection_item | (collection_item COMMA collection_items)
collection_item : py_value | EMPTY
Args:
lexer (line_arg_parser.lexer.Lexer):
An iterable producing a tokenized cell magic argument line.
"""
def __init__(self, lexer):
self._lexer = lexer
self._tokens_iter = iter(self._lexer)
self.get_next_token()
def get_next_token(self):
"""Obtain the next token from the token stream and store it as current."""
token = next(self._tokens_iter)
self._current_token = token
def consume(self, expected_type, exc_type=ParseError):
"""Move to the next token in token stream if it matches the expected type.
Args:
expected_type (lexer.TokenType): The expected token type to be consumed.
exc_type (Optional[ParseError]): The type of the exception to raise. Should be
the ``ParseError`` class or one of its subclasses. Defaults to
``ParseError``.
Raises:
ParseError: If the current token does not match the expected type.
"""
if self._current_token.type_ == expected_type:
if expected_type != TokenType.EOL:
self.get_next_token()
else:
if self._current_token.type_ == TokenType.EOL:
msg = "Unexpected end of input, expected {}.".format(expected_type)
else:
msg = "Expected token type {}, but found {} at position {}.".format(
expected_type, self._current_token.lexeme, self._current_token.pos
)
self.error(message=msg, exc_type=exc_type)
def error(self, message="Syntax error.", exc_type=ParseError):
"""Raise an error with the given message.
Args:
message (Optional[str]): The error message for the raised exception.
Defaults to ``"Syntax error."``.
exc_type (Optional[ParseError]): The type of the exception to raise. Should be
the ``ParseError`` class or one of its subclasses. Defaults to
``ParseError``.
Raises:
ParseError: Always raised with the given message.
"""
raise exc_type(message)
def input_line(self):
"""The top level method for parsing the cell magic arguments line.
Implements the following grammar production rule:
input_line : destination_var option_list
"""
dest_var = self.destination_var()
options = self.option_list()
token = self._current_token
if token.type_ != TokenType.EOL:
msg = "Unexpected input at position {}: {}".format(token.pos, token.lexeme)
self.error(msg)
return InputLine(dest_var, options)
def destination_var(self):
"""Implementation of the ``destination_var`` grammar production rule.
Production:
destination_var : DEST_VAR | EMPTY
"""
token = self._current_token
if token.type_ == TokenType.DEST_VAR:
self.consume(TokenType.DEST_VAR)
result = DestinationVar(token)
elif token.type_ == TokenType.UNKNOWN:
msg = "Unknown input at position {}: {}".format(token.pos, token.lexeme)
self.error(msg)
else:
result = DestinationVar(None)
return result
def option_list(self):
"""Implementation of the ``option_list`` grammar production rule.
Production:
option_list : (OPTION_SPEC [OPTION_EQ] option_value)*
(params_option | EMPTY)
(OPTION_SPEC [OPTION_EQ] option_value)*
"""
all_options = []
def parse_nonparams_options():
while self._current_token.type_ == TokenType.OPTION_SPEC:
token = self._current_token
self.consume(TokenType.OPTION_SPEC)
opt_name = token.lexeme[2:] # cut off the "--" prefix
# skip the optional "=" character
if self._current_token.type_ == TokenType.OPTION_EQ:
self.consume(TokenType.OPTION_EQ)
opt_value = self.option_value()
option = CmdOption(opt_name, opt_value)
all_options.append(option)
parse_nonparams_options()
token = self._current_token
if token.type_ == TokenType.PARAMS_OPT_SPEC:
option = self.params_option()
all_options.append(option)
parse_nonparams_options()
if self._current_token.type_ == TokenType.PARAMS_OPT_SPEC:
self.error(
message="Duplicate --params option", exc_type=DuplicateQueryParamsError
)
return CmdOptionList(all_options)
def option_value(self):
"""Implementation of the ``option_value`` grammar production rule.
Production:
option_value : OPT_VAL | EMPTY
"""
token = self._current_token
if token.type_ == TokenType.OPT_VAL:
self.consume(TokenType.OPT_VAL)
result = CmdOptionValue(token)
elif token.type_ == TokenType.UNKNOWN:
msg = "Unknown input at position {}: {}".format(token.pos, token.lexeme)
self.error(msg)
else:
result = None
return result
def params_option(self):
"""Implementation of the ``params_option`` grammar production rule.
Production:
params_option : PARAMS_OPT_SPEC [PARAMS_OPT_EQ] \
(DOLLAR_PY_ID | PY_STRING | py_dict)
"""
self.consume(TokenType.PARAMS_OPT_SPEC)
# skip the optional "=" character
if self._current_token.type_ == TokenType.PARAMS_OPT_EQ:
self.consume(TokenType.PARAMS_OPT_EQ)
if self._current_token.type_ == TokenType.DOLLAR_PY_ID:
token = self._current_token
self.consume(TokenType.DOLLAR_PY_ID)
opt_value = PyVarExpansion(token)
elif self._current_token.type_ == TokenType.PY_STRING:
token = self._current_token
self.consume(TokenType.PY_STRING, exc_type=QueryParamsParseError)
opt_value = PyScalarValue(token, token.lexeme)
else:
opt_value = self.py_dict()
result = ParamsOption(opt_value)
return result
def py_dict(self):
"""Implementation of the ``py_dict`` grammar production rule.
Production:
py_dict : LCURL dict_items RCURL
"""
self.consume(TokenType.LCURL, exc_type=QueryParamsParseError)
dict_items = self.dict_items()
self.consume(TokenType.RCURL, exc_type=QueryParamsParseError)
return PyDict(dict_items)
def dict_items(self):
"""Implementation of the ``dict_items`` grammar production rule.
Production:
dict_items : dict_item | (dict_item COMMA dict_items)
"""
result = []
item = self.dict_item()
if item is not None:
result.append(item)
while self._current_token.type_ == TokenType.COMMA:
self.consume(TokenType.COMMA, exc_type=QueryParamsParseError)
item = self.dict_item()
if item is not None:
result.append(item)
return result
def dict_item(self):
"""Implementation of the ``dict_item`` grammar production rule.
Production:
dict_item : (dict_key COLON py_value) | EMPTY
"""
token = self._current_token
if token.type_ == TokenType.PY_STRING:
key = self.dict_key()
self.consume(TokenType.COLON, exc_type=QueryParamsParseError)
value = self.py_value()
result = PyDictItem(key, value)
elif token.type_ == TokenType.UNKNOWN:
msg = "Unknown input at position {}: {}".format(token.pos, token.lexeme)
self.error(msg, exc_type=QueryParamsParseError)
else:
result = None
return result
def dict_key(self):
"""Implementation of the ``dict_key`` grammar production rule.
Production:
dict_key : PY_STRING
"""
token = self._current_token
self.consume(TokenType.PY_STRING, exc_type=QueryParamsParseError)
return PyDictKey(token)
def py_value(self):
"""Implementation of the ``py_value`` grammar production rule.
Production:
py_value : PY_BOOL | PY_NUMBER | PY_STRING | py_tuple | py_list | py_dict
"""
token = self._current_token
if token.type_ == TokenType.PY_BOOL:
self.consume(TokenType.PY_BOOL, exc_type=QueryParamsParseError)
return PyScalarValue(token, token.lexeme)
elif token.type_ == TokenType.PY_NUMBER:
self.consume(TokenType.PY_NUMBER, exc_type=QueryParamsParseError)
return PyScalarValue(token, token.lexeme)
elif token.type_ == TokenType.PY_STRING:
self.consume(TokenType.PY_STRING, exc_type=QueryParamsParseError)
return PyScalarValue(token, token.lexeme)
elif token.type_ == TokenType.LPAREN:
tuple_node = self.py_tuple()
return tuple_node
elif token.type_ == TokenType.LSQUARE:
list_node = self.py_list()
return list_node
elif token.type_ == TokenType.LCURL:
dict_node = self.py_dict()
return dict_node
else:
msg = "Unexpected token type {} at position {}.".format(
token.type_, token.pos
)
self.error(msg, exc_type=QueryParamsParseError)
def py_tuple(self):
"""Implementation of the ``py_tuple`` grammar production rule.
Production:
py_tuple : LPAREN collection_items RPAREN
"""
self.consume(TokenType.LPAREN, exc_type=QueryParamsParseError)
items = self.collection_items()
self.consume(TokenType.RPAREN, exc_type=QueryParamsParseError)
return PyTuple(items)
def py_list(self):
"""Implementation of the ``py_list`` grammar production rule.
Production:
py_list : LSQUARE collection_items RSQUARE
"""
self.consume(TokenType.LSQUARE, exc_type=QueryParamsParseError)
items = self.collection_items()
self.consume(TokenType.RSQUARE, exc_type=QueryParamsParseError)
return PyList(items)
def collection_items(self):
"""Implementation of the ``collection_items`` grammar production rule.
Production:
collection_items : collection_item | (collection_item COMMA collection_items)
"""
result = []
item = self.collection_item()
if item is not None:
result.append(item)
while self._current_token.type_ == TokenType.COMMA:
self.consume(TokenType.COMMA, exc_type=QueryParamsParseError)
item = self.collection_item()
if item is not None:
result.append(item)
return result
def collection_item(self):
"""Implementation of the ``collection_item`` grammar production rule.
Production:
collection_item : py_value | EMPTY
"""
if self._current_token.type_ not in {TokenType.RPAREN, TokenType.RSQUARE}:
result = self.py_value()
else:
result = None # end of list/tuple items
return result
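A sketch of driving the parser end to end (illustrative):

    >>> from google.cloud.bigquery.magics.line_arg_parser.lexer import Lexer
    >>> tree = Parser(Lexer("df --params {'num': 17}")).input_line()
    >>> tree.destination_var.name
    'df'
    >>> [opt.name for opt in tree.option_list.options]
    ['params']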


@@ -0,0 +1,159 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""This module contains classes that traverse AST and convert it to something else.
If the parser successfully accepts a valid input (the bigquery cell magic arguments),
the result is an Abstract Syntax Tree (AST) that represents the input as a tree
with notes containing various useful metadata.
Node visitors can process such tree and convert it to something else that can
be used for further processing, for example:
* An optimized version of the tree with redundancy removed/simplified (not used here).
* The same tree, but with semantic errors checked, because an otherwise syntactically
valid input might still contain errors (not used here, semantic errors are detected
elsewhere).
* A form that can be directly handed to the code that operates on the input. The
``QueryParamsExtractor`` class, for instance, splits the input arguments into
the "--params <...>" part and everything else.
The "everything else" part can be then parsed by the default Jupyter argument parser,
while the --params option is processed separately by the Python evaluator.
More info on the visitor design pattern:
https://en.wikipedia.org/wiki/Visitor_pattern
"""
from __future__ import print_function
class NodeVisitor(object):
"""Base visitor class implementing the dispatch machinery."""
def visit(self, node):
method_name = "visit_{}".format(type(node).__name__)
visitor_method = getattr(self, method_name, self.method_missing)
return visitor_method(node)
def method_missing(self, node):
raise Exception("No visit_{} method".format(type(node).__name__))
class QueryParamsExtractor(NodeVisitor):
"""A visitor that extracts the "--params <...>" part from input line arguments."""
def visit_InputLine(self, node):
params_dict_parts = []
other_parts = []
dest_var_parts = self.visit(node.destination_var)
params, other_options = self.visit(node.option_list)
if dest_var_parts:
other_parts.extend(dest_var_parts)
if dest_var_parts and other_options:
other_parts.append(" ")
other_parts.extend(other_options)
params_dict_parts.extend(params)
return "".join(params_dict_parts), "".join(other_parts)
def visit_DestinationVar(self, node):
return [node.name] if node.name is not None else []
def visit_CmdOptionList(self, node):
params_opt_parts = []
other_parts = []
for i, opt in enumerate(node.options):
option_parts = self.visit(opt)
list_to_extend = params_opt_parts if opt.name == "params" else other_parts
if list_to_extend:
list_to_extend.append(" ")
list_to_extend.extend(option_parts)
return params_opt_parts, other_parts
def visit_CmdOption(self, node):
result = ["--{}".format(node.name)]
if node.value is not None:
result.append(" ")
value_parts = self.visit(node.value)
result.extend(value_parts)
return result
def visit_CmdOptionValue(self, node):
return [node.value]
def visit_ParamsOption(self, node):
value_parts = self.visit(node.value)
return value_parts
def visit_PyVarExpansion(self, node):
return [node.raw_value]
def visit_PyDict(self, node):
result = ["{"]
for i, item in enumerate(node.items):
if i > 0:
result.append(", ")
item_parts = self.visit(item)
result.extend(item_parts)
result.append("}")
return result
def visit_PyDictItem(self, node):
result = self.visit(node.key) # key parts
result.append(": ")
value_parts = self.visit(node.value)
result.extend(value_parts)
return result
def visit_PyDictKey(self, node):
return [node.key_value]
def visit_PyScalarValue(self, node):
return [node.raw_value]
def visit_PyTuple(self, node):
result = ["("]
for i, item in enumerate(node.items):
if i > 0:
result.append(", ")
item_parts = self.visit(item)
result.extend(item_parts)
result.append(")")
return result
def visit_PyList(self, node):
result = ["["]
for i, item in enumerate(node.items):
if i > 0:
result.append(", ")
item_parts = self.visit(item)
result.extend(item_parts)
result.append("]")
return result
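A sketch of the full pipeline this visitor supports (illustrative; it mirrors the ``_split_args_line`` helper in the magics module below):

    >>> from google.cloud.bigquery.magics.line_arg_parser import (
    ...     Lexer, Parser, QueryParamsExtractor,
    ... )
    >>> tree = Parser(Lexer("df --params {'num': 17} --max_results 10")).input_line()
    >>> QueryParamsExtractor().visit(tree)
    ("{'num': 17}", 'df --max_results 10')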


@@ -0,0 +1,776 @@
# Copyright 2018 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""IPython Magics
Install ``bigquery-magics`` and call ``%load_ext bigquery_magics`` to use the
``%%bigquery`` cell magic.
See the `BigQuery Magics reference documentation
<https://googleapis.dev/python/bigquery-magics/latest/>`_.
"""
from __future__ import print_function
import re
import ast
import copy
import functools
import sys
import time
import warnings
from concurrent import futures
try:
import IPython # type: ignore
from IPython import display # type: ignore
from IPython.core import magic_arguments # type: ignore
except ImportError:
raise ImportError("This module can only be loaded in IPython.")
from google.api_core import client_info
from google.api_core import client_options
from google.api_core.exceptions import NotFound
import google.auth # type: ignore
from google.cloud import bigquery
import google.cloud.bigquery.dataset
from google.cloud.bigquery import _versions_helpers
from google.cloud.bigquery import exceptions
from google.cloud.bigquery.dbapi import _helpers
from google.cloud.bigquery.magics import line_arg_parser as lap
try:
import bigquery_magics # type: ignore
except ImportError:
bigquery_magics = None
IPYTHON_USER_AGENT = "ipython-{}".format(IPython.__version__) # type: ignore
class Context(object):
"""Storage for objects to be used throughout an IPython notebook session.
A Context object is initialized when the ``magics`` module is imported,
and can be found at ``google.cloud.bigquery.magics.context``.
"""
def __init__(self):
self._credentials = None
self._project = None
self._connection = None
self._default_query_job_config = bigquery.QueryJobConfig()
self._bigquery_client_options = client_options.ClientOptions()
self._bqstorage_client_options = client_options.ClientOptions()
self._progress_bar_type = "tqdm_notebook"
@property
def credentials(self):
"""google.auth.credentials.Credentials: Credentials to use for queries
performed through IPython magics.
Note:
These credentials do not need to be explicitly defined if you are
using Application Default Credentials. If you are not using
Application Default Credentials, manually construct a
:class:`google.auth.credentials.Credentials` object and set it as
the context credentials as demonstrated in the example below. See
`auth docs`_ for more information on obtaining credentials.
Example:
Manually setting the context credentials:
>>> from google.cloud.bigquery import magics
>>> from google.oauth2 import service_account
>>> credentials = (service_account
... .Credentials.from_service_account_file(
... '/path/to/key.json'))
>>> magics.context.credentials = credentials
.. _auth docs: http://google-auth.readthedocs.io
/en/latest/user-guide.html#obtaining-credentials
"""
if self._credentials is None:
self._credentials, _ = google.auth.default()
return self._credentials
@credentials.setter
def credentials(self, value):
self._credentials = value
@property
def project(self):
"""str: Default project to use for queries performed through IPython
magics.
Note:
The project does not need to be explicitly defined if you have an
environment default project set. If you do not have a default
project set in your environment, manually assign the project as
demonstrated in the example below.
Example:
Manually setting the context project:
>>> from google.cloud.bigquery import magics
>>> magics.context.project = 'my-project'
"""
if self._project is None:
_, self._project = google.auth.default()
return self._project
@project.setter
def project(self, value):
self._project = value
@property
def bigquery_client_options(self):
"""google.api_core.client_options.ClientOptions: client options to be
used through IPython magics.
Note:
The client options do not need to be explicitly defined if no
special network connections are required. Normally you would be
using the https://bigquery.googleapis.com/ endpoint.
Example:
Manually setting the endpoint:
>>> from google.cloud.bigquery import magics
>>> client_options = {}
>>> client_options['api_endpoint'] = "https://some.special.url"
>>> magics.context.bigquery_client_options = client_options
"""
return self._bigquery_client_options
@bigquery_client_options.setter
def bigquery_client_options(self, value):
self._bigquery_client_options = value
@property
def bqstorage_client_options(self):
"""google.api_core.client_options.ClientOptions: client options to be
used through IPython magics for the storage client.
Note:
The client options do not need to be explicitly defined if no
special network connections are required. Normally you would be
using the https://bigquerystorage.googleapis.com/ endpoint.
Example:
Manually setting the endpoint:
>>> from google.cloud.bigquery import magics
>>> client_options = {}
>>> client_options['api_endpoint'] = "https://some.special.url"
>>> magics.context.bqstorage_client_options = client_options
"""
return self._bqstorage_client_options
@bqstorage_client_options.setter
def bqstorage_client_options(self, value):
self._bqstorage_client_options = value
@property
def default_query_job_config(self):
"""google.cloud.bigquery.job.QueryJobConfig: Default job
configuration for queries.
The context's :class:`~google.cloud.bigquery.job.QueryJobConfig` is
used for queries. Some properties can be overridden with arguments to
the magics.
Example:
Manually setting the default value for ``maximum_bytes_billed``
to 100 MB:
>>> from google.cloud.bigquery import magics
>>> magics.context.default_query_job_config.maximum_bytes_billed = 100000000
"""
return self._default_query_job_config
@default_query_job_config.setter
def default_query_job_config(self, value):
self._default_query_job_config = value
@property
def progress_bar_type(self):
"""str: Default progress bar type to use to display progress bar while
executing queries through IPython magics.
Note:
Install the ``tqdm`` package to use this feature.
Example:
Manually setting the progress_bar_type:
>>> from google.cloud.bigquery import magics
>>> magics.context.progress_bar_type = "tqdm_notebook"
"""
return self._progress_bar_type
@progress_bar_type.setter
def progress_bar_type(self, value):
self._progress_bar_type = value
# If bigquery_magics is available, we load that extension rather than this one.
# Ensure google.cloud.bigquery.magics.context setters are on the correct magics
# implementation in case the user has installed the package but hasn't updated
# their code.
if bigquery_magics is not None:
context = bigquery_magics.context
else:
context = Context()
def _handle_error(error, destination_var=None):
"""Process a query execution error.
Args:
error (Exception):
An exception that occurred during the query execution.
destination_var (Optional[str]):
The name of the IPython session variable to store the query job.
"""
if destination_var:
query_job = getattr(error, "query_job", None)
if query_job is not None:
IPython.get_ipython().push({destination_var: query_job})
else:
# this is the case when previewing table rows by providing just a
# table ID to the cell magic
print(
"Could not save output to variable '{}'.".format(destination_var),
file=sys.stderr,
)
print("\nERROR:\n", str(error), file=sys.stderr)
def _run_query(client, query, job_config=None):
"""Runs a query while printing status updates
Args:
client (google.cloud.bigquery.client.Client):
Client to bundle configuration needed for API requests.
query (str):
SQL query to be executed. Defaults to the standard SQL dialect.
Use the ``job_config`` parameter to change dialects.
job_config (Optional[google.cloud.bigquery.job.QueryJobConfig]):
Extra configuration options for the job.
Returns:
google.cloud.bigquery.job.QueryJob: the query job created
Example:
>>> client = bigquery.Client()
>>> _run_query(client, "SELECT 17")
Executing query with job ID: bf633912-af2c-4780-b568-5d868058632b
Query executing: 1.66s
Query complete after 2.07s
'bf633912-af2c-4780-b568-5d868058632b'
"""
start_time = time.perf_counter()
query_job = client.query(query, job_config=job_config)
if job_config and job_config.dry_run:
return query_job
print(f"Executing query with job ID: {query_job.job_id}")
while True:
print(
f"\rQuery executing: {time.perf_counter() - start_time:.2f}s".format(),
end="",
)
try:
query_job.result(timeout=0.5)
break
except futures.TimeoutError:
continue
print(f"\nJob ID {query_job.job_id} successfully executed")
return query_job
def _create_dataset_if_necessary(client, dataset_id):
"""Create a dataset in the current project if it doesn't exist.
Args:
client (google.cloud.bigquery.client.Client):
Client to bundle configuration needed for API requests.
dataset_id (str):
Dataset id.
"""
dataset_reference = bigquery.dataset.DatasetReference(client.project, dataset_id)
try:
client.get_dataset(dataset_reference)  # the dataset already exists
return
except NotFound:
pass
dataset = bigquery.Dataset(dataset_reference)
dataset.location = client.location
print(f"Creating dataset: {dataset_id}")
dataset = client.create_dataset(dataset)
@magic_arguments.magic_arguments()
@magic_arguments.argument(
"destination_var",
nargs="?",
help=("If provided, save the output to this variable instead of displaying it."),
)
@magic_arguments.argument(
"--destination_table",
type=str,
default=None,
help=(
"If provided, save the output of the query to a new BigQuery table. "
"Variable should be in a format <dataset_id>.<table_id>. "
"If table does not exists, it will be created. "
"If table already exists, its data will be overwritten."
),
)
@magic_arguments.argument(
"--project",
type=str,
default=None,
help=("Project to use for executing this query. Defaults to the context project."),
)
@magic_arguments.argument(
"--max_results",
default=None,
help=(
"Maximum number of rows in dataframe returned from executing the query."
"Defaults to returning all rows."
),
)
@magic_arguments.argument(
"--maximum_bytes_billed",
default=None,
help=(
"maximum_bytes_billed to use for executing this query. Defaults to "
"the context default_query_job_config.maximum_bytes_billed."
),
)
@magic_arguments.argument(
"--dry_run",
action="store_true",
default=False,
help=(
"Sets query to be a dry run to estimate costs. "
"Defaults to executing the query instead of dry run if this argument is not used."
),
)
@magic_arguments.argument(
"--use_legacy_sql",
action="store_true",
default=False,
help=(
"Sets query to use Legacy SQL instead of Standard SQL. Defaults to "
"Standard SQL if this argument is not used."
),
)
@magic_arguments.argument(
"--bigquery_api_endpoint",
type=str,
default=None,
help=(
"The desired API endpoint, e.g., bigquery.googlepis.com. Defaults to this "
"option's value in the context bigquery_client_options."
),
)
@magic_arguments.argument(
"--bqstorage_api_endpoint",
type=str,
default=None,
help=(
"The desired API endpoint, e.g., bigquerystorage.googlepis.com. Defaults to "
"this option's value in the context bqstorage_client_options."
),
)
@magic_arguments.argument(
"--no_query_cache",
action="store_true",
default=False,
help=("Do not use cached query results."),
)
@magic_arguments.argument(
"--use_bqstorage_api",
action="store_true",
default=None,
help=(
"[Deprecated] The BigQuery Storage API is already used by default to "
"download large query results, and this option has no effect. "
"If you want to switch to the classic REST API instead, use the "
"--use_rest_api option."
),
)
@magic_arguments.argument(
"--use_rest_api",
action="store_true",
default=False,
help=(
"Use the classic REST API instead of the BigQuery Storage API to "
"download query results."
),
)
@magic_arguments.argument(
"--verbose",
action="store_true",
default=False,
help=(
"If set, print verbose output, including the query job ID and the "
"amount of time for the query to finish. By default, this "
"information will be displayed as the query runs, but will be "
"cleared after the query is finished."
),
)
@magic_arguments.argument(
"--params",
nargs="+",
default=None,
help=(
"Parameters to format the query string. If present, the --params "
"flag should be followed by a string representation of a dictionary "
"in the format {'param_name': 'param_value'} (ex. {\"num\": 17}), "
"or a reference to a dictionary in the same format. The dictionary "
"reference can be made by including a '$' before the variable "
"name (ex. $my_dict_var)."
),
)
@magic_arguments.argument(
"--progress_bar_type",
type=str,
default=None,
help=(
"Sets progress bar type to display a progress bar while executing the query."
"Defaults to use tqdm_notebook. Install the ``tqdm`` package to use this feature."
),
)
@magic_arguments.argument(
"--location",
type=str,
default=None,
help=(
"Set the location to execute query."
"Defaults to location set in query setting in console."
),
)
def _cell_magic(line, query):
"""Underlying function for bigquery cell magic
Note:
This function contains the underlying logic for the 'bigquery' cell
magic. This function is not meant to be called directly.
Args:
line (str): "%%bigquery" followed by arguments as required
query (str): SQL query to run
Returns:
pandas.DataFrame: the query results.
"""
# The built-in parser does not recognize Python structures such as dicts, thus
# we extract the "--params" option and inteprpret it separately.
try:
params_option_value, rest_of_args = _split_args_line(line)
except lap.exceptions.QueryParamsParseError as exc:
rebranded_error = SyntaxError(
"--params is not a correctly formatted JSON string or a JSON "
"serializable dictionary"
)
raise rebranded_error from exc
except lap.exceptions.DuplicateQueryParamsError as exc:
rebranded_error = ValueError("Duplicate --params option.")
raise rebranded_error from exc
except lap.exceptions.ParseError as exc:
rebranded_error = ValueError(
"Unrecognized input, are option values correct? "
"Error details: {}".format(exc.args[0])
)
raise rebranded_error from exc
args = magic_arguments.parse_argstring(_cell_magic, rest_of_args)
if args.use_bqstorage_api is not None:
warnings.warn(
"Deprecated option --use_bqstorage_api, the BigQuery "
"Storage API is already used by default.",
category=DeprecationWarning,
)
use_bqstorage_api = not args.use_rest_api
location = args.location
params = []
if params_option_value:
# A non-existing params variable is not expanded and ends up in the input
# in its raw form, e.g. "$query_params".
if params_option_value.startswith("$"):
msg = 'Parameter expansion failed, undefined variable "{}".'.format(
params_option_value[1:]
)
raise NameError(msg)
params = _helpers.to_query_parameters(ast.literal_eval(params_option_value), {})
project = args.project or context.project
bigquery_client_options = copy.deepcopy(context.bigquery_client_options)
if args.bigquery_api_endpoint:
if isinstance(bigquery_client_options, dict):
bigquery_client_options["api_endpoint"] = args.bigquery_api_endpoint
else:
bigquery_client_options.api_endpoint = args.bigquery_api_endpoint
client = bigquery.Client(
project=project,
credentials=context.credentials,
default_query_job_config=context.default_query_job_config,
client_info=client_info.ClientInfo(user_agent=IPYTHON_USER_AGENT),
client_options=bigquery_client_options,
location=location,
)
if context._connection:
client._connection = context._connection
bqstorage_client_options = copy.deepcopy(context.bqstorage_client_options)
if args.bqstorage_api_endpoint:
if isinstance(bqstorage_client_options, dict):
bqstorage_client_options["api_endpoint"] = args.bqstorage_api_endpoint
else:
bqstorage_client_options.api_endpoint = args.bqstorage_api_endpoint
bqstorage_client = _make_bqstorage_client(
client,
use_bqstorage_api,
bqstorage_client_options,
)
close_transports = functools.partial(_close_transports, client, bqstorage_client)
try:
if args.max_results:
max_results = int(args.max_results)
else:
max_results = None
query = query.strip()
if not query:
error = ValueError("Query is missing.")
_handle_error(error, args.destination_var)
return
# Check if query is given as a reference to a variable.
if query.startswith("$"):
query_var_name = query[1:]
if not query_var_name:
missing_msg = 'Missing query variable name, empty "$" is not allowed.'
raise NameError(missing_msg)
if query_var_name.isidentifier():
ip = IPython.get_ipython()
query = ip.user_ns.get(query_var_name, ip) # ip serves as a sentinel
if query is ip:
raise NameError(
f"Unknown query, variable {query_var_name} does not exist."
)
else:
if not isinstance(query, (str, bytes)):
raise TypeError(
f"Query variable {query_var_name} must be a string "
"or a bytes-like value."
)
# Any query that does not contain whitespace (aside from leading and trailing whitespace)
# is assumed to be a table id
if not re.search(r"\s", query):
try:
rows = client.list_rows(query, max_results=max_results)
except Exception as ex:
_handle_error(ex, args.destination_var)
return
result = rows.to_dataframe(
bqstorage_client=bqstorage_client,
create_bqstorage_client=False,
)
if args.destination_var:
IPython.get_ipython().push({args.destination_var: result})
return
else:
return result
job_config = bigquery.job.QueryJobConfig()
job_config.query_parameters = params
job_config.use_legacy_sql = args.use_legacy_sql
job_config.dry_run = args.dry_run
# Don't override context job config unless --no_query_cache is explicitly set.
if args.no_query_cache:
job_config.use_query_cache = False
if args.destination_table:
split = args.destination_table.split(".")
if len(split) != 2:
raise ValueError(
"--destination_table should be in a <dataset_id>.<table_id> format."
)
dataset_id, table_id = split
job_config.allow_large_results = True
dataset_ref = bigquery.dataset.DatasetReference(client.project, dataset_id)
destination_table_ref = dataset_ref.table(table_id)
job_config.destination = destination_table_ref
job_config.create_disposition = "CREATE_IF_NEEDED"
job_config.write_disposition = "WRITE_TRUNCATE"
_create_dataset_if_necessary(client, dataset_id)
if args.maximum_bytes_billed == "None":
job_config.maximum_bytes_billed = 0
elif args.maximum_bytes_billed is not None:
value = int(args.maximum_bytes_billed)
job_config.maximum_bytes_billed = value
try:
query_job = _run_query(client, query, job_config=job_config)
except Exception as ex:
_handle_error(ex, args.destination_var)
return
if not args.verbose:
display.clear_output()
if args.dry_run and args.destination_var:
IPython.get_ipython().push({args.destination_var: query_job})
return
elif args.dry_run:
print(
"Query validated. This query will process {} bytes.".format(
query_job.total_bytes_processed
)
)
return query_job
progress_bar = context.progress_bar_type or args.progress_bar_type
if max_results:
result = query_job.result(max_results=max_results).to_dataframe(
bqstorage_client=None,
create_bqstorage_client=False,
progress_bar_type=progress_bar,
)
else:
result = query_job.to_dataframe(
bqstorage_client=bqstorage_client,
create_bqstorage_client=False,
progress_bar_type=progress_bar,
)
if args.destination_var:
IPython.get_ipython().push({args.destination_var: result})
else:
return result
finally:
close_transports()
def _split_args_line(line):
"""Split out the --params option value from the input line arguments.
Args:
line (str): The line arguments passed to the cell magic.
Returns:
Tuple[str, str]
"""
lexer = lap.Lexer(line)
scanner = lap.Parser(lexer)
tree = scanner.input_line()
extractor = lap.QueryParamsExtractor()
params_option_value, rest_of_args = extractor.visit(tree)
return params_option_value, rest_of_args
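# Illustrative behavior of the helper above (editorial sketch):
#
#   >>> _split_args_line('df --params {"num": 17} --max_results 10')
#   ('{"num": 17}', 'df --max_results 10')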
def _make_bqstorage_client(client, use_bqstorage_api, client_options):
"""Creates a BigQuery Storage client.
Args:
client (:class:`~google.cloud.bigquery.client.Client`): BigQuery client.
use_bqstorage_api (bool): whether BigQuery Storage API is used or not.
client_options (:class:`google.api_core.client_options.ClientOptions`):
Custom options used with a new BigQuery Storage client instance
if one is created.
Raises:
ImportError: if google-cloud-bigquery-storage is not installed, or
grpcio package is not installed.
Returns:
Optional[google.cloud.bigquery_storage.BigQueryReadClient]: A BigQuery
Storage client if one can be created, or None if ``use_bqstorage_api``
is False or the installed google-cloud-bigquery-storage package is outdated.
"""
if not use_bqstorage_api:
return None
try:
_versions_helpers.BQ_STORAGE_VERSIONS.try_import(raise_if_error=True)
except exceptions.BigQueryStorageNotFoundError as err:
customized_error = ImportError(
"The default BigQuery Storage API client cannot be used, install "
"the missing google-cloud-bigquery-storage and pyarrow packages "
"to use it. Alternatively, use the classic REST API by specifying "
"the --use_rest_api magic option."
)
raise customized_error from err
except exceptions.LegacyBigQueryStorageError:
pass
try:
from google.api_core.gapic_v1 import client_info as gapic_client_info
except ImportError as err:
customized_error = ImportError(
"Install the grpcio package to use the BigQuery Storage API."
)
raise customized_error from err
return client._ensure_bqstorage_client(
client_options=client_options,
client_info=gapic_client_info.ClientInfo(user_agent=IPYTHON_USER_AGENT),
)
def _close_transports(client, bqstorage_client):
"""Close the given clients' underlying transport channels.
Closing the transport is needed to release system resources, namely open
sockets.
Args:
client (:class:`~google.cloud.bigquery.client.Client`):
A client for the BigQuery API.
bqstorage_client
(Optional[:class:`~google.cloud.bigquery_storage.BigQueryReadClient`]):
A client for the BigQuery Storage API.
"""
client.close()
if bqstorage_client is not None:
bqstorage_client._transport.grpc_channel.close()