Add support for byte and unicode Literal strings (python#6087)

This pull request adds support for byte and unicode Literal strings. I left in some comments explaining some nuances of the implementation; here are a few additional meta-notes:

1. I reworded several of the comments suggesting that the way we represent bytes as a string is a "hack" or that we should eventually switch to representing bytes as literally bytes. I started with that approach but ultimately rejected it: I ended up having to constantly serialize/deserialize between bytes and strings, which I felt complicated the code. As a result, I decided that the solution we had previously is in fact, from a high-level perspective, the best possible approach. (The actual code for translating the output of `typed_ast` into a human-readable string *is* admittedly a bit hacky though.) In any case, the phrase "how mypy currently parses the contents of bytes literals" is severely out-of-date anyways. That comment was added about 3 years ago, when we were adding the fast parser for the first time and running it concurrently with the actual parser.

2. I removed the `is_stub` field from `fastparse2.ASTConverter`: it turned out we were just never using that field.

3. One complication I ran into was figuring out how to handle forward references to literal strings. For example, suppose we have the type `List["Literal['foo']"]`. Do we treat this as being equivalent to `List[Literal[u'foo']]` or `List[Literal[b'foo']]`? If this is a Python 3 file or a Python 2 file with `unicode_literals`, we'd want to pick the former. If this is a standard Python 2 file, we'd want to pick the latter. In order to make this happen, I decided to use a heuristic where the type of the "outer" string decides the type of the "inner" string. For example:

   - In Python 3, `"Literal['foo']"` is a unicode string. So, the inner `Literal['foo']` will be treated as the same as `Literal[u'foo']`.
   - The same thing happens when using Python 2 with `unicode_literals`.
   - In Python 3, it is illegal to use a byte string as a forward reference. So, types like `List[b"Literal['foo']"]` are already illegal.
   - In standard Python 2, `"Literal['foo']"` is a byte string. So the inner `Literal['foo']` will be treated as the same as `Literal[b'foo']`.

4. I will add tests validating that all of this stuff works as expected with incremental and fine-grained mode in a separate diff -- probably after fixing and landing python#6075, which I intend to use as a baseline foundation.
Michael0x2a · Dec 28, 2018 · fcee66d · fcee66d
1 parent 9a3fa64
commit fcee66d
Show file tree

Hide file tree

Showing 10 changed files with 668 additions and 68 deletions.
diff --git a/mypy/checkexpr.py b/mypy/checkexpr.py
@@ -1784,11 +1784,17 @@ def visit_str_expr(self, e: StrExpr) -> Type:
 
     def visit_bytes_expr(self, e: BytesExpr) -> Type:
         """Type check a bytes literal (trivial)."""
-        return self.named_type('builtins.bytes')
+        typ = self.named_type('builtins.bytes')
+        if is_literal_type_like(self.type_context[-1]):
+            return LiteralType(value=e.value, fallback=typ)
+        return typ
 
     def visit_unicode_expr(self, e: UnicodeExpr) -> Type:
         """Type check a unicode literal (trivial)."""
-        return self.named_type('builtins.unicode')
+        typ = self.named_type('builtins.unicode')
+        if is_literal_type_like(self.type_context[-1]):
+            return LiteralType(value=e.value, fallback=typ)
+        return typ
 
     def visit_float_expr(self, e: FloatExpr) -> Type:
         """Type check a float literal (trivial)."""

diff --git a/mypy/exprtotype.py b/mypy/exprtotype.py
@@ -5,7 +5,7 @@
     ListExpr, StrExpr, BytesExpr, UnicodeExpr, EllipsisExpr, CallExpr,
     get_member_expr_fullname
 )
-from mypy.fastparse import parse_type_comment, parse_type_string
+from mypy.fastparse import parse_type_string
 from mypy.types import (
     Type, UnboundType, TypeList, EllipsisType, AnyType, Optional, CallableArgument, TypeOfAny,
     RawLiteralType,
@@ -111,8 +111,15 @@ def expr_to_unanalyzed_type(expr: Expression, _parent: Optional[Expression] = No
     elif isinstance(expr, ListExpr):
         return TypeList([expr_to_unanalyzed_type(t, expr) for t in expr.items],
                         line=expr.line, column=expr.column)
-    elif isinstance(expr, (StrExpr, BytesExpr, UnicodeExpr)):
-        return parse_type_string(expr.value, expr.line, expr.column)
+    elif isinstance(expr, StrExpr):
+        return parse_type_string(expr.value, 'builtins.str', expr.line, expr.column,
+                                 assume_str_is_unicode=expr.from_python_3)
+    elif isinstance(expr, BytesExpr):
+        return parse_type_string(expr.value, 'builtins.bytes', expr.line, expr.column,
+                                 assume_str_is_unicode=False)
+    elif isinstance(expr, UnicodeExpr):
+        return parse_type_string(expr.value, 'builtins.unicode', expr.line, expr.column,
+                                 assume_str_is_unicode=True)
     elif isinstance(expr, UnaryExpr):
         typ = expr_to_unanalyzed_type(expr.expr)
         if isinstance(typ, RawLiteralType) and isinstance(typ.value, int) and expr.op == '-':

diff --git a/mypy/fastparse.py b/mypy/fastparse.py
@@ -51,6 +51,7 @@
         NameConstant,
         Expression as ast3_Expression,
         Str,
+        Bytes,
         Index,
         Num,
         UnaryOp,
@@ -140,7 +141,11 @@ def parse(source: Union[str, bytes],
     return tree
 
 
-def parse_type_comment(type_comment: str, line: int, errors: Optional[Errors]) -> Optional[Type]:
+def parse_type_comment(type_comment: str,
+                       line: int,
+                       errors: Optional[Errors],
+                       assume_str_is_unicode: bool = True,
+                       ) -> Optional[Type]:
     try:
         typ = ast3.parse(type_comment, '<type_comment>', 'eval')
     except SyntaxError as e:
@@ -151,24 +156,39 @@ def parse_type_comment(type_comment: str, line: int, errors: Optional[Errors]) -
             raise
     else:
         assert isinstance(typ, ast3_Expression)
-        return TypeConverter(errors, line=line).visit(typ.body)
+        return TypeConverter(errors, line=line,
+                             assume_str_is_unicode=assume_str_is_unicode).visit(typ.body)
 
 
-def parse_type_string(expr_string: str, line: int, column: int) -> Type:
-    """Parses a type that was originally present inside of an explicit string.
+def parse_type_string(expr_string: str, expr_fallback_name: str,
+                      line: int, column: int, assume_str_is_unicode: bool = True) -> Type:
+    """Parses a type that was originally present inside of an explicit string,
+    byte string, or unicode string.
 
     For example, suppose we have the type `Foo["blah"]`. We should parse the
     string expression "blah" using this function.
+
+    If `assume_str_is_unicode` is set to true, this function will assume that
+    `Foo["blah"]` is equivalent to `Foo[u"blah"]`. Otherwise, it assumes it's
+    equivalent to `Foo[b"blah"]`.
+
+    The caller is responsible for keeping track of the context in which the
+    type string was encountered (e.g. in Python 3 code, Python 2 code, Python 2
+    code with unicode_literals...) and setting `assume_str_is_unicode` accordingly.
     """
     try:
-        node = parse_type_comment(expr_string.strip(), line=line, errors=None)
+        node = parse_type_comment(expr_string.strip(), line=line, errors=None,
+                                  assume_str_is_unicode=assume_str_is_unicode)
         if isinstance(node, UnboundType) and node.original_str_expr is None:
             node.original_str_expr = expr_string
+            node.original_str_fallback = expr_fallback_name
             return node
         else:
-            return RawLiteralType(expr_string, 'builtins.str', line, column)
-    except SyntaxError:
-        return RawLiteralType(expr_string, 'builtins.str', line, column)
+            return RawLiteralType(expr_string, expr_fallback_name, line, column)
+    except (SyntaxError, ValueError):
+        # Note: the parser will raise a `ValueError` instead of a SyntaxError if
+        # the string happens to contain things like \x00.
+        return RawLiteralType(expr_string, expr_fallback_name, line, column)
 
 
 def is_no_type_check_decorator(expr: ast3.expr) -> bool:
@@ -966,10 +986,7 @@ def visit_FormattedValue(self, n: ast3.FormattedValue) -> Expression:
 
     # Bytes(bytes s)
     def visit_Bytes(self, n: ast3.Bytes) -> Union[BytesExpr, StrExpr]:
-        # The following line is a bit hacky, but is the best way to maintain
-        # compatibility with how mypy currently parses the contents of bytes literals.
-        contents = str(n.s)[2:-1]
-        e = BytesExpr(contents)
+        e = BytesExpr(bytes_to_human_readable_repr(n.s))
         return self.set_line(e, n)
 
     # NameConstant(singleton value)
@@ -1042,10 +1059,15 @@ def visit_Index(self, n: Index) -> Node:
 
 
 class TypeConverter:
-    def __init__(self, errors: Optional[Errors], line: int = -1) -> None:
+    def __init__(self,
+                 errors: Optional[Errors],
+                 line: int = -1,
+                 assume_str_is_unicode: bool = True,
+                 ) -> None:
         self.errors = errors
         self.line = line
         self.node_stack = []  # type: List[AST]
+        self.assume_str_is_unicode = assume_str_is_unicode
 
     @overload
     def visit(self, node: ast3.expr) -> Type: ...
@@ -1090,8 +1112,11 @@ def visit_raw_str(self, s: str) -> Type:
         # An escape hatch that allows the AST walker in fastparse2 to
         # directly hook into the Python 3.5 type converter in some cases
         # without needing to create an intermediary `Str` object.
-        return (parse_type_comment(s.strip(), self.line, self.errors) or
-                AnyType(TypeOfAny.from_error))
+        return (parse_type_comment(s.strip(),
+                                   self.line,
+                                   self.errors,
+                                   self.assume_str_is_unicode)
+                or AnyType(TypeOfAny.from_error))
 
     def visit_Call(self, e: Call) -> Type:
         # Parse the arg constructor
@@ -1190,7 +1215,22 @@ def visit_Num(self, n: Num) -> Type:
 
     # Str(string s)
     def visit_Str(self, n: Str) -> Type:
-        return parse_type_string(n.s, line=self.line, column=-1)
+        # Note: we transform these fallback types into the correct types in
+        # 'typeanal.py' -- specifically in the named_type_with_normalized_str method.
+        # If we're analyzing Python 3, that function will translate 'builtins.unicode'
+        # into 'builtins.str'. In contrast, if we're analyzing Python 2 code, we'll
+        # translate 'builtins.bytes' in the method below into 'builtins.str'.
+        if 'u' in n.kind or self.assume_str_is_unicode:
+            return parse_type_string(n.s, 'builtins.unicode', self.line, n.col_offset,
+                                     assume_str_is_unicode=self.assume_str_is_unicode)
+        else:
+            return parse_type_string(n.s, 'builtins.str', self.line, n.col_offset,
+                                     assume_str_is_unicode=self.assume_str_is_unicode)
+
+    # Bytes(bytes s)
+    def visit_Bytes(self, n: Bytes) -> Type:
+        contents = bytes_to_human_readable_repr(n.s)
+        return RawLiteralType(contents, 'builtins.bytes', self.line, column=n.col_offset)
 
     # Subscript(expr value, slice slice, expr_context ctx)
     def visit_Subscript(self, n: ast3.Subscript) -> Type:
@@ -1246,3 +1286,17 @@ def stringify_name(n: AST) -> Optional[str]:
         if sv is not None:
             return "{}.{}".format(sv, n.attr)
     return None  # Can't do it.
+
+
+def bytes_to_human_readable_repr(b: bytes) -> str:
+    """Converts bytes into some human-readable representation. Unprintable
+    bytes such as the nul byte are escaped. For example:
+
+        >>> b = bytes([102, 111, 111, 10, 0])
+        >>> s = bytes_to_human_readable_repr(b)
+        >>> print(s)
+        foo\n\x00
+        >>> print(repr(s))
+        'foo\\n\\x00'
+    """
+    return str(b)[2:-1]
diff --git a/mypy/fastparse2.py b/mypy/fastparse2.py
@@ -45,7 +45,7 @@
 )
 from mypy import messages
 from mypy.errors import Errors
-from mypy.fastparse import TypeConverter, parse_type_comment
+from mypy.fastparse import TypeConverter, parse_type_comment, bytes_to_human_readable_repr
 from mypy.options import Options
 
 try:
@@ -113,7 +113,6 @@ def parse(source: Union[str, bytes],
         assert options.python_version[0] < 3 and not is_stub_file
         ast = ast27.parse(source, fnam, 'exec')
         tree = ASTConverter(options=options,
-                            is_stub=is_stub_file,
                             errors=errors,
                             ).visit(ast)
         assert isinstance(tree, MypyFile)
@@ -141,15 +140,32 @@ def is_no_type_check_decorator(expr: ast27.expr) -> bool:
 class ASTConverter:
     def __init__(self,
                  options: Options,
-                 is_stub: bool,
                  errors: Errors) -> None:
         self.class_nesting = 0
         self.imports = []  # type: List[ImportBase]
 
         self.options = options
-        self.is_stub = is_stub
         self.errors = errors
 
+        # Indicates whether this file is being parsed with unicode_literals enabled.
+        # Note: typed_ast already naturally takes unicode_literals into account when
+        # parsing so we don't have to worry when analyzing strings within this class.
+        #
+        # The only place where we use this field is when we call fastparse's TypeConverter
+        # and any related methods. That class accepts a Python 3 AST instead of a Python 2
+        # AST: as a result, it doesn't special-case the `unicode_literals` import and won't know
+        # exactly whether to parse some string as bytes or unicode.
+        #
+        # This distinction is relevant mostly when handling Literal types -- Literal[u"foo"]
+        # is not the same type as Literal[b"foo"], and Literal["foo"] could mean either the
+        # former or the latter based on context.
+        #
+        # This field is set in the 'visit_ImportFrom' method: it's ok to delay computing it
+        # because any `from __future__ import blah` import must be located at the top of the
+        # file, with the exception of the docstring. This means we're guaranteed to correctly
+        # set this field before we encounter any type hints.
+        self.unicode_literals = False
+
         # Cache of visit_X methods keyed by type of visited object
         self.visitor_cache = {}  # type: Dict[type, Callable[[Optional[AST]], Any]]
 
@@ -306,7 +322,8 @@ def visit_Module(self, mod: ast27.Module) -> MypyFile:
     #              arg? kwarg, expr* defaults)
     def visit_FunctionDef(self, n: ast27.FunctionDef) -> Statement:
         lineno = n.lineno
-        converter = TypeConverter(self.errors, line=lineno)
+        converter = TypeConverter(self.errors, line=lineno,
+                                  assume_str_is_unicode=self.unicode_literals)
         args, decompose_stmts = self.transform_args(n.args, lineno)
 
         arg_kinds = [arg.kind for arg in args]
@@ -413,7 +430,8 @@ def transform_args(self,
                        line: int,
                        ) -> Tuple[List[Argument], List[Statement]]:
         type_comments = n.type_comments  # type: Sequence[Optional[str]]
-        converter = TypeConverter(self.errors, line=line)
+        converter = TypeConverter(self.errors, line=line,
+                                  assume_str_is_unicode=self.unicode_literals)
         decompose_stmts = []  # type: List[Statement]
 
         n_args = n.args
@@ -532,7 +550,8 @@ def visit_Delete(self, n: ast27.Delete) -> DelStmt:
     def visit_Assign(self, n: ast27.Assign) -> AssignmentStmt:
         typ = None
         if n.type_comment:
-            typ = parse_type_comment(n.type_comment, n.lineno, self.errors)
+            typ = parse_type_comment(n.type_comment, n.lineno, self.errors,
+                                     assume_str_is_unicode=self.unicode_literals)
 
         stmt = AssignmentStmt(self.translate_expr_list(n.targets),
                               self.visit(n.value),
@@ -549,7 +568,8 @@ def visit_AugAssign(self, n: ast27.AugAssign) -> OperatorAssignmentStmt:
     # For(expr target, expr iter, stmt* body, stmt* orelse, string? type_comment)
     def visit_For(self, n: ast27.For) -> ForStmt:
         if n.type_comment is not None:
-            target_type = parse_type_comment(n.type_comment, n.lineno, self.errors)
+            target_type = parse_type_comment(n.type_comment, n.lineno, self.errors,
+                                             assume_str_is_unicode=self.unicode_literals)
         else:
             target_type = None
         stmt = ForStmt(self.visit(n.target),
@@ -576,7 +596,8 @@ def visit_If(self, n: ast27.If) -> IfStmt:
     # With(withitem* items, stmt* body, string? type_comment)
     def visit_With(self, n: ast27.With) -> WithStmt:
         if n.type_comment is not None:
-            target_type = parse_type_comment(n.type_comment, n.lineno, self.errors)
+            target_type = parse_type_comment(n.type_comment, n.lineno, self.errors,
+                                             assume_str_is_unicode=self.unicode_literals)
         else:
             target_type = None
         stmt = WithStmt([self.visit(n.context_expr)],
@@ -680,9 +701,12 @@ def visit_ImportFrom(self, n: ast27.ImportFrom) -> ImportBase:
             mod = n.module if n.module is not None else ''
             i = ImportAll(mod, n.level)  # type: ImportBase
         else:
-            i = ImportFrom(self.translate_module_id(n.module) if n.module is not None else '',
-                           n.level,
-                           [(a.name, a.asname) for a in n.names])
+            module_id = self.translate_module_id(n.module) if n.module is not None else ''
+            i = ImportFrom(module_id, n.level, [(a.name, a.asname) for a in n.names])
+
+            # See comments in the constructor for more information about this field.
+            if module_id == '__future__' and any(a.name == 'unicode_literals' for a in n.names):
+                self.unicode_literals = True
         self.imports.append(i)
         return self.set_line(i, n)
 
@@ -900,18 +924,17 @@ def visit_Num(self, n: ast27.Num) -> Expression:
 
     # Str(string s)
     def visit_Str(self, n: ast27.Str) -> Expression:
-        # Hack: assume all string literals in Python 2 stubs are normal
-        # strs (i.e. not unicode).  All stubs are parsed with the Python 3
-        # parser, which causes unprefixed string literals to be interpreted
-        # as unicode instead of bytes.  This hack is generally okay,
-        # because mypy considers str literals to be compatible with
-        # unicode.
+        # Note: typed_ast.ast27 will handle unicode_literals for us. If
+        # n.s is of type 'bytes', we know unicode_literals was not enabled;
+        # otherwise we know it was.
+        #
+        # Note that the following code is NOT run when parsing Python 2.7 stubs:
+        # we always parse stub files (no matter what version) using the Python 3
+        # parser. This is also why string literals in Python 2.7 stubs are assumed
+        # to be unicode.
         if isinstance(n.s, bytes):
-            value = n.s
-            # The following line is a bit hacky, but is the best way to maintain
-            # compatibility with how mypy currently parses the contents of bytes literals.
-            contents = str(value)[2:-1]
-            e = StrExpr(contents)  # type: Union[StrExpr, UnicodeExpr]
+            contents = bytes_to_human_readable_repr(n.s)
+            e = StrExpr(contents, from_python_3=False)  # type: Union[StrExpr, UnicodeExpr]
             return self.set_line(e, n)
         else:
             e = UnicodeExpr(n.s)

diff --git a/mypy/literals.py b/mypy/literals.py
@@ -98,7 +98,7 @@ def visit_int_expr(self, e: IntExpr) -> Key:
         return ('Literal', e.value)
 
     def visit_str_expr(self, e: StrExpr) -> Key:
-        return ('Literal', e.value)
+        return ('Literal', e.value, e.from_python_3)
 
     def visit_bytes_expr(self, e: BytesExpr) -> Key:
         return ('Literal', e.value)