feat: add custom cell magic parser to handle complex --params values (#213)

* chore: Move cell magic code into its own directory

* Add custom argument parser for cell magic

* Add AST node visitor

* Use a custom parser for cell magic arguments

* Improve cell magic parser test coverage

* Generalize valid option values

The parser should accept as wide a range of values as possible and let
the code that deals with the semantics decide whether the values are
valid.

* Fix recognizing --params option in state 3

The --params option spec must be followed by a non-alphanumeric
character; otherwise it is a different option spec (e.g. --paramsX).
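
A minimal sketch of the distinction, using the lookahead from the
PARAMS_OPT_SPEC token defined in the lexer below:

    import re

    params_opt = re.compile(r"--params(?=\s|=|--|$)")

    params_opt.match("--params=")   # matches; "=" satisfies the lookahead
    params_opt.match("--paramsX")   # None; this is a different option spec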

* Fix typo in comment

* Cover missing parser code path with a test

* Preserve the cell magic context's import path

The context still needs to be importable from the old path.

* Clarify lexer states

* Replace re.scanner with finditer()

* Fix typo in docstring

* Simplify string literal in a single line

Apparently, black just places all implicitly concatenated string
literals on a single line when they are short enough, without replacing
them with a single string literal.
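
An illustrative example (hypothetical snippet): given

    message = (
        "implicitly "
        "concatenated"
    )

black produces

    message = "implicitly " "concatenated"

when the result fits on one line, rather than merging the parts into a
single literal.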

* Explain the visitors module.

* Pass pos as a positional arg to finditer()

This is necessary to retain Python 2 compatibility.
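
A sketch of the two call styles (pattern and offset are illustrative; the
keyword form is what reportedly breaks under Python 2):

    import re

    pattern = re.compile(r"\w+")
    pattern.finditer("some input text", 5)       # pos passed positionally
    # pattern.finditer("some input text", pos=5) # keyword form: Python 3 only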

* Resolve coverage complaint about a code path

The tokens are designed in such a way that the scanner *always* returns
some match, even if just UNKNOWN or EOL. The "no matches" code path
can thus never be taken, but the coverage check can't know that.
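
This can be demonstrated with just the catch-all "common" tokens copied
from the lexer:

    import re

    common = re.compile(r"(?P<WS>\s+)|(?P<EOL>$)|(?P<UNKNOWN>\S+)")
    [m.lastgroup for m in common.finditer("?! any @@ input")]
    # -> ['UNKNOWN', 'WS', 'UNKNOWN', 'WS', 'UNKNOWN', 'WS', 'UNKNOWN', 'EOL']

Every position is covered by WS or UNKNOWN, and the scan always ends with
an empty EOL match.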
plamut authored Sep 9, 2020
1 parent aa1613c commit dcfbac2
Showing 14 changed files with 1,644 additions and 65 deletions.
2 changes: 1 addition & 1 deletion docs/magics.rst
@@ -1,5 +1,5 @@
 IPython Magics for BigQuery
 ===========================

-.. automodule:: google.cloud.bigquery.magics
+.. automodule:: google.cloud.bigquery.magics.magics
    :members:
2 changes: 1 addition & 1 deletion google/cloud/bigquery/__init__.py
@@ -150,7 +150,7 @@

 def load_ipython_extension(ipython):
     """Called by IPython when this module is loaded as an IPython extension."""
-    from google.cloud.bigquery.magics import _cell_magic
+    from google.cloud.bigquery.magics.magics import _cell_magic

     ipython.register_magic_function(
         _cell_magic, magic_kind="cell", magic_name="bigquery"
20 changes: 20 additions & 0 deletions google/cloud/bigquery/magics/__init__.py
@@ -0,0 +1,20 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from google.cloud.bigquery.magics.magics import context


# For backwards compatibility we need to make the context available in the path
# google.cloud.bigquery.magics.context
__all__ = ("context",)
34 changes: 34 additions & 0 deletions google/cloud/bigquery/magics/line_arg_parser/__init__.py
@@ -0,0 +1,34 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from google.cloud.bigquery.magics.line_arg_parser.exceptions import ParseError
from google.cloud.bigquery.magics.line_arg_parser.exceptions import (
DuplicateQueryParamsError,
QueryParamsParseError,
)
from google.cloud.bigquery.magics.line_arg_parser.lexer import Lexer
from google.cloud.bigquery.magics.line_arg_parser.lexer import TokenType
from google.cloud.bigquery.magics.line_arg_parser.parser import Parser
from google.cloud.bigquery.magics.line_arg_parser.visitors import QueryParamsExtractor


__all__ = (
"DuplicateQueryParamsError",
"Lexer",
"Parser",
"ParseError",
"QueryParamsExtractor",
"QueryParamsParseError",
"TokenType",
)
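
A hedged sketch of how these exports are meant to compose when parsing a
magic line (the Parser entry point and the visitor's return shape are
assumptions based on the exported names; neither is shown in this diff):

    from google.cloud.bigquery.magics import line_arg_parser as lap

    lexer = lap.Lexer('--params {"num": 17} --use_legacy_sql False')
    tree = lap.Parser(lexer).input_line()  # entry-point name is an assumption
    params_str, remaining = lap.QueryParamsExtractor().visit(tree)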
25 changes: 25 additions & 0 deletions google/cloud/bigquery/magics/line_arg_parser/exceptions.py
@@ -0,0 +1,25 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


class ParseError(Exception):
pass


class QueryParamsParseError(ParseError):
"""Raised when --params option is syntactically incorrect."""


class DuplicateQueryParamsError(ParseError):
pass
268 changes: 268 additions & 0 deletions google/cloud/bigquery/magics/line_arg_parser/lexer.py
@@ -0,0 +1,268 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from collections import namedtuple
from collections import OrderedDict
import itertools
import re

import enum


Token = namedtuple("Token", ("type_", "lexeme", "pos"))
StateTransition = namedtuple("StateTransition", ("new_state", "total_offset"))

# Pattern matching is done with regexes, and the order in which the token patterns are
# defined is important.
#
# Suppose we had the following token definitions:
# * INT - a token matching integers,
# * FLOAT - a token matching floating point numbers,
# * DOT - a token matching a single literal dot character, i.e. "."
#
# The FLOAT token would have to be defined first, since we would want the input "1.23"
# to be tokenized as a single FLOAT token, and *not* three tokens (INT, DOT, INT).
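#
# For example (an illustrative sketch, not part of the token set below):
#
#   ordered = re.compile(r"(?P<FLOAT>\d+\.\d+)|(?P<INT>\d+)|(?P<DOT>\.)")
#   [m.lastgroup for m in ordered.finditer("1.23")]   # -> ['FLOAT']
#
#   swapped = re.compile(r"(?P<INT>\d+)|(?P<FLOAT>\d+\.\d+)|(?P<DOT>\.)")
#   [m.lastgroup for m in swapped.finditer("1.23")]   # -> ['INT', 'DOT', 'INT']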
#
# Sometimes, however, different tokens match patterns that are too similar, and it is
# not possible to define them in an order that avoids all ambiguity. One such case is
# the pair of OPT_VAL and PY_NUMBER tokens, as both can match an integer literal, say "42".
#
# To avoid such dilemmas, the lexer implements a concept of STATES. States are
# used to split token definitions into subgroups, and in each lexer state only a single
# subgroup is used for tokenizing the input. Lexer states can therefore be thought of as
# token namespaces.
#
# For example, while parsing the value of the "--params" option, we do not want to
# "recognize" it as a single OPT_VAL token, but instead want to parse it as a Python
# dictionary and verify its syntactic correctness. On the other hand, while parsing
# the value of an option other than "--params", we do not really care about its
# structure, and thus do not want to use any of the "Python tokens" for pattern matching.
#
# Since token definition order is important, an OrderedDict is needed with tightly
# controlled member definitions (i.e. passed as a sequence, and *not* via kwargs).
token_types = OrderedDict(
[
(
"state_parse_pos_args",
OrderedDict(
[
(
"GOTO_PARSE_NON_PARAMS_OPTIONS",
r"(?P<GOTO_PARSE_NON_PARAMS_OPTIONS>(?=--))", # double dash - starting the options list
),
(
"DEST_VAR",
r"(?P<DEST_VAR>[^\d\W]\w*)", # essentially a Python ID
),
]
),
),
(
"state_parse_non_params_options",
OrderedDict(
[
(
"GOTO_PARSE_PARAMS_OPTION",
r"(?P<GOTO_PARSE_PARAMS_OPTION>(?=--params(?:\s|=|--|$)))", # the --params option
),
("OPTION_SPEC", r"(?P<OPTION_SPEC>--\w+)"),
("OPTION_EQ", r"(?P<OPTION_EQ>=)"),
("OPT_VAL", r"(?P<OPT_VAL>\S+?(?=\s|--|$))"),
]
),
),
(
"state_parse_params_option",
OrderedDict(
[
(
"PY_STRING",
r"(?P<PY_STRING>(?:{})|(?:{}))".format(
r"'(?:[^'\\]|\.)*'",
r'"(?:[^"\\]|\.)*"', # single and double quoted strings
),
),
("PARAMS_OPT_SPEC", r"(?P<PARAMS_OPT_SPEC>--params(?=\s|=|--|$))"),
("PARAMS_OPT_EQ", r"(?P<PARAMS_OPT_EQ>=)"),
(
"GOTO_PARSE_NON_PARAMS_OPTIONS",
r"(?P<GOTO_PARSE_NON_PARAMS_OPTIONS>(?=--\w+))", # found another option spec
),
("PY_BOOL", r"(?P<PY_BOOL>True|False)"),
("DOLLAR_PY_ID", r"(?P<DOLLAR_PY_ID>\$[^\d\W]\w*)"),
(
"PY_NUMBER",
r"(?P<PY_NUMBER>-?[1-9]\d*(?:\.\d+)?(:?[e|E][+-]?\d+)?)",
),
("SQUOTE", r"(?P<SQUOTE>')"),
("DQUOTE", r'(?P<DQUOTE>")'),
("COLON", r"(?P<COLON>:)"),
("COMMA", r"(?P<COMMA>,)"),
("LCURL", r"(?P<LCURL>\{)"),
("RCURL", r"(?P<RCURL>})"),
("LSQUARE", r"(?P<LSQUARE>\[)"),
("RSQUARE", r"(?P<RSQUARE>])"),
("LPAREN", r"(?P<LPAREN>\()"),
("RPAREN", r"(?P<RPAREN>\))"),
]
),
),
(
"common",
OrderedDict(
[
("WS", r"(?P<WS>\s+)"),
("EOL", r"(?P<EOL>$)"),
(
# anything not a whitespace or matched by something else
"UNKNOWN",
r"(?P<UNKNOWN>\S+)",
),
]
),
),
]
)


# The _generate_next_value_() enum hook is only available in Python 3.6+, so we
# need to do some acrobatics to implement an "auto str enum" base class. Implementation
# based on the recipe provided by the very author of the Enum library:
# https://stackoverflow.com/a/32313954/5040035
class StrEnumMeta(enum.EnumMeta):
@classmethod
def __prepare__(metacls, name, bases, **kwargs):
# Having a deterministic enum member definition order is nice.
return OrderedDict()

def __new__(metacls, name, bases, oldclassdict):
# Scan through the declared enum members and convert any value that is a plain
# empty tuple into a `str` of the name instead.
newclassdict = enum._EnumDict()
for key, val in oldclassdict.items():
if val == ():
val = key
newclassdict[key] = val
return super(StrEnumMeta, metacls).__new__(metacls, name, bases, newclassdict)


# The @six.add_metaclass decorator does not work (Enum complains about _sunder_ names),
# and we cannot use class syntax directly, because the Python 3 version would cause
# a syntax error under Python 2.
AutoStrEnum = StrEnumMeta(
"AutoStrEnum",
(str, enum.Enum),
{"__doc__": "Base enum class for for name=value str enums."},
)
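
# For example, members of an AutoStrEnum subclass declared with empty-tuple
# values become strings equal to their own names (illustrative class):
#
#   class Color(AutoStrEnum):
#       RED = ()
#
#   Color.RED == "RED"  # True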

TokenType = AutoStrEnum(
"TokenType",
[
(name, name)
for name in itertools.chain.from_iterable(token_types.values())
if not name.startswith("GOTO_")
],
)


class LexerState(AutoStrEnum):
PARSE_POS_ARGS = () # parsing positional arguments
PARSE_NON_PARAMS_OPTIONS = () # parsing options other than "--params"
PARSE_PARAMS_OPTION = () # parsing the "--params" option
STATE_END = ()


class Lexer(object):
"""Lexical analyzer for tokenizing the cell magic input line."""

_GRAND_PATTERNS = {
LexerState.PARSE_POS_ARGS: re.compile(
"|".join(
itertools.chain(
token_types["state_parse_pos_args"].values(),
token_types["common"].values(),
)
)
),
LexerState.PARSE_NON_PARAMS_OPTIONS: re.compile(
"|".join(
itertools.chain(
token_types["state_parse_non_params_options"].values(),
token_types["common"].values(),
)
)
),
LexerState.PARSE_PARAMS_OPTION: re.compile(
"|".join(
itertools.chain(
token_types["state_parse_params_option"].values(),
token_types["common"].values(),
)
)
),
}

def __init__(self, input_text):
self._text = input_text

def __iter__(self):
# Since re.scanner does not seem to support manipulating inner scanner states,
# we need to implement lexer state transitions manually using special
# non-capturing lookahead token patterns to signal when a state transition
# should be made.
# Since we don't have "nested" states, we don't really need a stack and
# this simple mechanism is sufficient.
state = LexerState.PARSE_POS_ARGS
offset = 0 # the number of characters processed so far

while state != LexerState.STATE_END:
token_stream = self._find_state_tokens(state, offset)

for maybe_token in token_stream: # pragma: NO COVER
if isinstance(maybe_token, StateTransition):
state = maybe_token.new_state
offset = maybe_token.total_offset
break

if maybe_token.type_ != TokenType.WS:
yield maybe_token

if maybe_token.type_ == TokenType.EOL:
state = LexerState.STATE_END
break

def _find_state_tokens(self, state, current_offset):
"""Scan the input for current state's tokens starting at ``current_offset``.
Args:
state (LexerState): The current lexer state.
current_offset (int): The offset in the input text, i.e. the number
of characters already scanned so far.
Yields:
The next ``Token`` or ``StateTransition`` instance.
"""
pattern = self._GRAND_PATTERNS[state]
scanner = pattern.finditer(self._text, current_offset)

for match in scanner: # pragma: NO COVER
token_type = match.lastgroup

if token_type.startswith("GOTO_"):
yield StateTransition(
new_state=getattr(LexerState, token_type[5:]), # w/o "GOTO_" prefix
total_offset=match.start(),
)

yield Token(token_type, match.group(), match.start())
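
As a quick sanity check of the lexer as a whole (a sketch; the expected
stream follows from the token definitions above, with WS tokens filtered
out and GOTO_* state transitions consumed internally):

    from google.cloud.bigquery.magics.line_arg_parser import Lexer

    tokens = [(t.type_, t.lexeme) for t in Lexer("df --max_results 100")]
    # Expected: [('DEST_VAR', 'df'), ('OPTION_SPEC', '--max_results'),
    #            ('OPT_VAL', '100'), ('EOL', '')]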