feat: add custom cell magic parser to handle complex --params values (#213)

* chore: Move cell magic code into its own directory
* Add custom argument parser for cell magic
* Add AST node visitor
* Use a custom parser for cell magic arguments
* Improve cell magic parser test coverage
* Generalize valid option values

  The parser should accept as wide a range of values as possible and let
  the code that deals with the semantics decide whether the values are
  good or not.

* Fix recognizing --params option in state 3

  The --params option spec must be followed by a non-alphanumeric
  character, otherwise it's a different option spec (e.g. --paramsX).

* Fix typo in comment
* Cover missing parser code path with a test
* Preserve the cell magic context's import path

  The context still needs to be importable from the old path.

* Clarify lexer states
* Replace re.scanner with finditer()
* Fix typo in docstring
* Simplify string literal in a single line

  Apparently black just places all implicitly concatenated string
  literals on a single line when short enough, without replacing them
  with a single string literal.

* Explain the visitors module.
* Pass pos as a positional arg to finditer()

  This is necessary to retain Python 2 compatibility.

* Resolve coverage complaint about a code path

  The tokens are designed in a way that the scanner *always* returns some
  match, even if just UNKNOWN or EOL. The "no matches" code path can thus
  never be taken, but the coverage check can't know that.
Showing 14 changed files with 1,644 additions and 65 deletions.
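To see why a custom parser is needed here, consider the argument line the cell magic receives. A quick illustration (the invocation is hypothetical, and the fragmented output shown in the comment is approximate):

```python
import shlex

# An argument line a user might pass to the %%bigquery cell magic. The
# --params value is a Python/JSON-style dict literal containing spaces,
# quotes, and nested brackets.
line = 'df --params {"min_age": 18, "labels": ["a", "b"]}'

# Naive shell-style splitting strips the quotes and fragments the dict
# value into meaningless pieces, roughly:
#   ['df', '--params', '{min_age:', '18,', 'labels:', '[a,', 'b]}']
# which is why this commit adds a dedicated state-based lexer and parser
# for the magic's argument line.
print(shlex.split(line))
```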
@@ -1,5 +1,5 @@
 IPython Magics for BigQuery
 ===========================

-.. automodule:: google.cloud.bigquery.magics
+.. automodule:: google.cloud.bigquery.magics.magics
    :members:
@@ -0,0 +1,20 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from google.cloud.bigquery.magics.magics import context


# For backwards compatibility we need to make the context available in the path
# google.cloud.bigquery.magics.context
__all__ = ("context",)
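The re-export above is what keeps the documented import path alive after magics.py moved into the new magics/ package. A minimal compatibility check (assuming the package is installed):

```python
# The old import path still works after the refactoring ...
from google.cloud.bigquery.magics import context

# ... and refers to the very same object as the new path.
from google.cloud.bigquery.magics.magics import context as new_context

assert context is new_context
```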
@@ -0,0 +1,34 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from google.cloud.bigquery.magics.line_arg_parser.exceptions import ParseError
from google.cloud.bigquery.magics.line_arg_parser.exceptions import (
    DuplicateQueryParamsError,
    QueryParamsParseError,
)
from google.cloud.bigquery.magics.line_arg_parser.lexer import Lexer
from google.cloud.bigquery.magics.line_arg_parser.lexer import TokenType
from google.cloud.bigquery.magics.line_arg_parser.parser import Parser
from google.cloud.bigquery.magics.line_arg_parser.visitors import QueryParamsExtractor


__all__ = (
    "DuplicateQueryParamsError",
    "Lexer",
    "Parser",
    "ParseError",
    "QueryParamsExtractor",
    "QueryParamsParseError",
    "TokenType",
)
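These exports outline the intended pipeline: the Lexer tokenizes the raw argument line, the Parser builds a small AST from the token stream, and the QueryParamsExtractor visitor walks the tree to pull out the --params value. A sketch of that composition; the Parser entry point name (input_line) and the visitor's return shape are assumptions based on the commit's description, not shown in this excerpt:

```python
from google.cloud.bigquery.magics import line_arg_parser as lap


def split_params(line):
    """Separate the --params option value from the rest of the arguments."""
    lexer = lap.Lexer(line)                 # tokenize the raw argument line
    tree = lap.Parser(lexer).input_line()   # assumed parser entry point
    extractor = lap.QueryParamsExtractor()
    # Assumed return shape: (params value as a string, remaining arguments).
    params_value, rest_of_args = extractor.visit(tree)
    return params_value, rest_of_args
```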
google/cloud/bigquery/magics/line_arg_parser/exceptions.py (25 changes: 25 additions & 0 deletions)
@@ -0,0 +1,25 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


class ParseError(Exception):
    pass


class QueryParamsParseError(ParseError):
    """Raised when --params option is syntactically incorrect."""


class DuplicateQueryParamsError(ParseError):
    pass
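Since both concrete errors subclass ParseError, callers can catch the base class for any argument-line problem, or the subclasses to react specifically to --params failures. A brief sketch (parse_line is a hypothetical stand-in for the actual parsing call):

```python
from google.cloud.bigquery.magics.line_arg_parser import (
    DuplicateQueryParamsError,
    ParseError,
    QueryParamsParseError,
)


def try_parse(parse_line, line):
    try:
        return parse_line(line)  # hypothetical parsing callable
    except DuplicateQueryParamsError:
        print("--params was passed more than once")
    except QueryParamsParseError as exc:
        print("--params value is not syntactically valid: %s" % exc)
    except ParseError as exc:  # base class catches any remaining parse issue
        print("could not parse the argument line: %s" % exc)
```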
@@ -0,0 +1,268 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from collections import namedtuple
from collections import OrderedDict
import itertools
import re

import enum


Token = namedtuple("Token", ("type_", "lexeme", "pos"))
StateTransition = namedtuple("StateTransition", ("new_state", "total_offset"))

# Pattern matching is done with regexes, and the order in which the token patterns are
# defined is important.
#
# Suppose we had the following token definitions:
#     * INT - a token matching integers,
#     * FLOAT - a token matching floating point numbers,
#     * DOT - a token matching a single literal dot character, i.e. "."
#
# The FLOAT token would have to be defined first, since we would want the input "1.23"
# to be tokenized as a single FLOAT token, and *not* three tokens (INT, DOT, INT).
#
# Sometimes, however, different tokens match patterns that are too similar, and it is
# not possible to define them in an order that would avoid any ambiguity. One such case
# is that of the OPT_VAL and PY_NUMBER tokens, as both can match an integer literal,
# say "42".
#
# In order to avoid the dilemmas, the lexer implements a concept of STATES. States are
# used to split token definitions into subgroups, and in each lexer state only a single
# subgroup is used for tokenizing the input. Lexer states can therefore be thought of
# as token namespaces.
#
# For example, while parsing the value of the "--params" option, we do not want to
# "recognize" it as a single OPT_VAL token, but instead want to parse it as a Python
# dictionary and verify its syntactical correctness. On the other hand, while parsing
# the value of an option other than "--params", we do not really care about its
# structure, and thus do not want to use any of the "Python tokens" for pattern matching.
#
# Since token definition order is important, an OrderedDict is needed with tightly
# controlled member definitions (i.e. passed as a sequence, and *not* via kwargs).
token_types = OrderedDict(
    [
        (
            "state_parse_pos_args",
            OrderedDict(
                [
                    (
                        "GOTO_PARSE_NON_PARAMS_OPTIONS",
                        r"(?P<GOTO_PARSE_NON_PARAMS_OPTIONS>(?=--))",  # double dash - starting the options list
                    ),
                    (
                        "DEST_VAR",
                        r"(?P<DEST_VAR>[^\d\W]\w*)",  # essentially a Python ID
                    ),
                ]
            ),
        ),
        (
            "state_parse_non_params_options",
            OrderedDict(
                [
                    (
                        "GOTO_PARSE_PARAMS_OPTION",
                        r"(?P<GOTO_PARSE_PARAMS_OPTION>(?=--params(?:\s|=|--|$)))",  # the --params option
                    ),
                    ("OPTION_SPEC", r"(?P<OPTION_SPEC>--\w+)"),
                    ("OPTION_EQ", r"(?P<OPTION_EQ>=)"),
                    ("OPT_VAL", r"(?P<OPT_VAL>\S+?(?=\s|--|$))"),
                ]
            ),
        ),
        (
            "state_parse_params_option",
            OrderedDict(
                [
                    (
                        "PY_STRING",
                        r"(?P<PY_STRING>(?:{})|(?:{}))".format(
                            r"'(?:[^'\\]|\\.)*'",
                            r'"(?:[^"\\]|\\.)*"',  # single and double quoted strings
                        ),
                    ),
                    ("PARAMS_OPT_SPEC", r"(?P<PARAMS_OPT_SPEC>--params(?=\s|=|--|$))"),
                    ("PARAMS_OPT_EQ", r"(?P<PARAMS_OPT_EQ>=)"),
                    (
                        "GOTO_PARSE_NON_PARAMS_OPTIONS",
                        r"(?P<GOTO_PARSE_NON_PARAMS_OPTIONS>(?=--\w+))",  # found another option spec
                    ),
                    ("PY_BOOL", r"(?P<PY_BOOL>True|False)"),
                    ("DOLLAR_PY_ID", r"(?P<DOLLAR_PY_ID>\$[^\d\W]\w*)"),
                    (
                        "PY_NUMBER",
                        r"(?P<PY_NUMBER>-?[1-9]\d*(?:\.\d+)?(?:[eE][+-]?\d+)?)",
                    ),
                    ("SQUOTE", r"(?P<SQUOTE>')"),
                    ("DQUOTE", r'(?P<DQUOTE>")'),
                    ("COLON", r"(?P<COLON>:)"),
                    ("COMMA", r"(?P<COMMA>,)"),
                    ("LCURL", r"(?P<LCURL>\{)"),
                    ("RCURL", r"(?P<RCURL>})"),
                    ("LSQUARE", r"(?P<LSQUARE>\[)"),
                    ("RSQUARE", r"(?P<RSQUARE>])"),
                    ("LPAREN", r"(?P<LPAREN>\()"),
                    ("RPAREN", r"(?P<RPAREN>\))"),
                ]
            ),
        ),
        (
            "common",
            OrderedDict(
                [
                    ("WS", r"(?P<WS>\s+)"),
                    ("EOL", r"(?P<EOL>$)"),
                    (
                        # anything not a whitespace or matched by something else
                        "UNKNOWN",
                        r"(?P<UNKNOWN>\S+)",
                    ),
                ]
            ),
        ),
    ]
)


# The _generate_next_value_() enum hook is only available in Python 3.6+, thus we
# need to do some acrobatics to implement an "auto str enum" base class. Implementation
# based on the recipe provided by the very author of the Enum library:
# https://stackoverflow.com/a/32313954/5040035
class StrEnumMeta(enum.EnumMeta):
    @classmethod
    def __prepare__(metacls, name, bases, **kwargs):
        # Having deterministic enum members definition order is nice.
        return OrderedDict()

    def __new__(metacls, name, bases, oldclassdict):
        # Scan through the declared enum members and convert any value that is a plain
        # empty tuple into a `str` of the name instead.
        newclassdict = enum._EnumDict()
        for key, val in oldclassdict.items():
            if val == ():
                val = key
            newclassdict[key] = val
        return super(StrEnumMeta, metacls).__new__(metacls, name, bases, newclassdict)


# The @six.add_metaclass decorator does not work, Enum complains about _sunder_ names,
# and we cannot use class syntax directly, because the Python 3 version would cause
# a syntax error under Python 2.
AutoStrEnum = StrEnumMeta(
    "AutoStrEnum",
    (str, enum.Enum),
    {"__doc__": "Base enum class for name=value str enums."},
)

TokenType = AutoStrEnum(
    "TokenType",
    [
        (name, name)
        for name in itertools.chain.from_iterable(token_types.values())
        if not name.startswith("GOTO_")
    ],
)


class LexerState(AutoStrEnum):
    PARSE_POS_ARGS = ()  # parsing positional arguments
    PARSE_NON_PARAMS_OPTIONS = ()  # parsing options other than "--params"
    PARSE_PARAMS_OPTION = ()  # parsing the "--params" option
    STATE_END = ()


class Lexer(object):
    """Lexical analyzer for tokenizing the cell magic input line."""

    _GRAND_PATTERNS = {
        LexerState.PARSE_POS_ARGS: re.compile(
            "|".join(
                itertools.chain(
                    token_types["state_parse_pos_args"].values(),
                    token_types["common"].values(),
                )
            )
        ),
        LexerState.PARSE_NON_PARAMS_OPTIONS: re.compile(
            "|".join(
                itertools.chain(
                    token_types["state_parse_non_params_options"].values(),
                    token_types["common"].values(),
                )
            )
        ),
        LexerState.PARSE_PARAMS_OPTION: re.compile(
            "|".join(
                itertools.chain(
                    token_types["state_parse_params_option"].values(),
                    token_types["common"].values(),
                )
            )
        ),
    }

    def __init__(self, input_text):
        self._text = input_text

    def __iter__(self):
        # Since re.scanner does not seem to support manipulating inner scanner states,
        # we need to implement lexer state transitions manually using special
        # non-capturing lookahead token patterns to signal when a state transition
        # should be made.
        # Since we don't have "nested" states, we don't really need a stack and
        # this simple mechanism is sufficient.
        state = LexerState.PARSE_POS_ARGS
        offset = 0  # the number of characters processed so far

        while state != LexerState.STATE_END:
            token_stream = self._find_state_tokens(state, offset)

            for maybe_token in token_stream:  # pragma: NO COVER
                if isinstance(maybe_token, StateTransition):
                    state = maybe_token.new_state
                    offset = maybe_token.total_offset
                    break

                if maybe_token.type_ != TokenType.WS:
                    yield maybe_token

                if maybe_token.type_ == TokenType.EOL:
                    state = LexerState.STATE_END
                    break

    def _find_state_tokens(self, state, current_offset):
        """Scan the input for current state's tokens starting at ``current_offset``.

        Args:
            state (LexerState): The current lexer state.
            current_offset (int): The offset in the input text, i.e. the number
                of characters already scanned so far.

        Yields:
            The next ``Token`` or ``StateTransition`` instance.
        """
        pattern = self._GRAND_PATTERNS[state]
        scanner = pattern.finditer(self._text, current_offset)

        for match in scanner:  # pragma: NO COVER
            token_type = match.lastgroup

            if token_type.startswith("GOTO_"):
                yield StateTransition(
                    new_state=getattr(LexerState, token_type[5:]),  # w/o "GOTO_" prefix
                    total_offset=match.start(),
                )

            yield Token(token_type, match.group(), match.start())
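Because Lexer is iterable and silently drops whitespace tokens, it can be driven directly to watch the state machine work. A small demonstration; the token stream in the comment is reconstructed by hand from the patterns above, so treat it as approximate:

```python
from google.cloud.bigquery.magics.line_arg_parser.lexer import Lexer, TokenType

for token in Lexer('df --params {"min_age": 18}'):
    print(token.type_, repr(token.lexeme), token.pos)

# Expected roughly: DEST_VAR 'df', then (after two state transitions)
# PARAMS_OPT_SPEC '--params', LCURL '{', PY_STRING '"min_age"', COLON ':',
# PY_NUMBER '18', RCURL '}', and finally EOL ''.

# TokenType members are str-valued, so they compare equal to their own names,
# which is what makes the string lastgroup comparisons in __iter__ work.
assert TokenType.EOL == "EOL"
```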