Rework correction of line continuation in strings

- Actually provide line continuation in strings that complies with the specification; the fix provided by rspivak/slimit#24 was problematic (it breaks position tracking) and incomplete, as it amounted to no more than a clumsy workaround (come on, duplicated regex code). - Also provide the expected minified output, which strips out these line continuation fragments.
metatoaster · Apr 22, 2018 · 3e5ee6c · 3e5ee6c
1 parent 5074001
commit 3e5ee6c
Show file tree

Hide file tree

Showing 11 changed files with 115 additions and 36 deletions.
diff --git a/src/calmjs/parse/handlers/core.py b/src/calmjs/parse/handlers/core.py
@@ -33,6 +33,7 @@
     Indent,
     Dedent,
 )
+from calmjs.parse.lexers.es5 import PATT_LINE_CONTINUATION
 
 required_space = re.compile(r'^(?:\w\w|\+\+|\-\-)$')
 
@@ -179,6 +180,11 @@ def layout_handler_space_minimum(dispatcher, node, before, after, prev):
         yield space_imply
 
 
+def deferrable_handler_literal_continuation(dispatcher, node):
+    # strip line continuations; assumes the es5 pattern keeps working.
+    return PATT_LINE_CONTINUATION.sub('', node.value)
+
+
 def default_rules():
     return {'layout_handlers': {
         OpenBlock: layout_handler_openbrace,

diff --git a/src/calmjs/parse/lexers/es5.py b/src/calmjs/parse/lexers/es5.py
@@ -74,6 +74,8 @@
 
 PATT_LINE_TERMINATOR_SEQUENCE = re.compile(
     ur'(\n|\r(?!\n)|\u2028|\u2029|\r\n)', flags=re.S)
+PATT_LINE_CONTINUATION = re.compile(
+    ur'\\(\n|\r(?!\n)|\u2028|\u2029|\r\n)', flags=re.S)
 
 
 class Lexer(object):
@@ -505,57 +507,36 @@ def t_regex_error(self, token):
     )
     """
 
-    string = r"""
+    string = ur"""
     (?:
         # double quoted string
         (?:"                               # opening double quote
-            (?: [^"\\\n\r]                 # no \, line terminators or "
-                | \\[a-zA-Z!-\/:-@\[-`{-~] # or escaped characters
+            (?: [^"\\\n\r\u2028\u2029]     # no \, line terminators or "
+                | \\(\n|\r(?!\n)|\u2028|\u2029|\r\n)  # or line continuation
+                | \\[a-zA-Z!-\/:-@\[-`{-~] # or escaped characters
                 | \\x[0-9a-fA-F]{2}        # or hex_escape_sequence
                 | \\u[0-9a-fA-F]{4}        # or unicode_escape_sequence
                 | \\(?:[1-7][0-7]{0,2}|[0-7]{2,3})  # or octal_escape_sequence
                 | \\0                      # or <NUL> (15.10.2.11)
             )*?                            # zero or many times
-            (?: \\\n                       # multiline ?
-              (?:
-                [^"\\\n\r]                 # no \, line terminators or "
-                | \\[a-zA-Z!-\/:-@\[-`{-~] # or escaped characters
-                | \\x[0-9a-fA-F]{2}        # or hex_escape_sequence
-                | \\u[0-9a-fA-F]{4}        # or unicode_escape_sequence
-                | \\(?:[1-7][0-7]{0,2}|[0-7]{2,3}) # or octal_escape_sequence
-                | \\0                      # or <NUL> (15.10.2.11)
-              )*?                          # zero or many times
-            )*
         ")                                 # closing double quote
         |
         # single quoted string
         (?:'                               # opening single quote
-            (?: [^'\\\n\r]                 # no \, line terminators or '
+            (?: [^'\\\n\r\u2028\u2029]     # no \, line terminators or '
+                | \\(\n|\r(?!\n)|\u2028|\u2029|\r\n)  # or line continuation
                 | \\[a-zA-Z!-\/:-@\[-`{-~] # or escaped characters
                 | \\x[0-9a-fA-F]{2}        # or hex_escape_sequence
                 | \\u[0-9a-fA-F]{4}        # or unicode_escape_sequence
                 | \\(?:[1-7][0-7]{0,2}|[0-7]{2,3}) # or octal_escape_sequence
                 | \\0                      # or <NUL> (15.10.2.11)
             )*?                            # zero or many times
-            (?: \\\n                       # multiline ?
-              (?:
-                [^'\\\n\r]                 # no \, line terminators or '
-                | \\[a-zA-Z!-\/:-@\[-`{-~] # or escaped characters
-                | \\x[0-9a-fA-F]{2}        # or hex_escape_sequence
-                | \\u[0-9a-fA-F]{4}        # or unicode_escape_sequence
-                | \\(?:[1-7][0-7]{0,2}|[0-7]{2,3}) # or octal_escape_sequence
-                | \\0                      # or <NUL> (15.10.2.11)
-              )*?                          # zero or many times
-            )*
         ')                                 # closing single quote
     )
     """  # "
 
     @ply.lex.TOKEN(string)
     def t_STRING(self, token):
-        # remove escape + new line sequence used for strings
-        # written across multiple lines of code
-        token.value = token.value.replace('\\\n', '')
         return token
 
     # XXX: <ZWNJ> <ZWJ> ?

diff --git a/src/calmjs/parse/rules.py b/src/calmjs/parse/rules.py
@@ -19,6 +19,7 @@
     Indent,
     Dedent,
     Resolve,
+    Literal,
 )
 from calmjs.parse.handlers.core import (
     rule_handler_noop,
@@ -34,6 +35,8 @@
     layout_handler_space_optional_pretty,
     layout_handler_space_minimum,
 
+    deferrable_handler_literal_continuation,
+
     default_rules,
     minimum_rules,
 )
@@ -111,7 +114,12 @@ def minify(drop_semi=True):
         })
 
     def minify_rule():
-        return {'layout_handlers': layout_handlers}
+        return {
+            'layout_handlers': layout_handlers,
+            'deferrable_handlers': {
+                Literal: deferrable_handler_literal_continuation,
+            },
+        }
 
     return minify_rule
 

diff --git a/src/calmjs/parse/ruletypes.py b/src/calmjs/parse/ruletypes.py
@@ -478,5 +478,18 @@ def __call__(self, dispatcher, node):
         return node.value
 
 
+class Literal(Deferrable):
+    """
+    Emit a literal (such as a string) through its deferrable handler, if any.
+    """
+
+    def __call__(self, dispatcher, node):
+        handler = dispatcher.deferrable(self)
+        if handler is not NotImplemented:
+            # a handler is available; whatever it returns is the result
+            return handler(dispatcher, node)
+        return node.value
+
+
 children_newline = JoinAttr(Iter(), value=(Newline,))
 children_comma = JoinAttr(Iter(), value=(Text(value=','), Space,))
diff --git a/src/calmjs/parse/tests/lexer.py b/src/calmjs/parse/tests/lexer.py
@@ -117,14 +117,12 @@
         # multiline string (string written across multiple lines
         # of code) https://github.com/rspivak/slimit/issues/24
         'slimit_issue_24_multi_line_code_double',
-        (r"""var a = 'hello \
-world'""",
-         ['VAR var', 'ID a', 'EQ =', "STRING 'hello world'"]),
+        ("var a = 'hello \\\n world'""",
+         ['VAR var', 'ID a', 'EQ =', "STRING 'hello \\\n world'"]),
     ), (
         'slimit_issue_24_multi_line_code_single',
-        (r'''var a = "hello \
-world"''',
-         ['VAR var', 'ID a', 'EQ =', 'STRING "hello world"']),
+        ('var a = "hello \\\r world"',
+         ['VAR var', 'ID a', 'EQ =', 'STRING "hello \\\r world"']),
     ), (
         # # Comments
         # ("""
@@ -354,6 +352,30 @@
           'PERIOD .', 'ID split', 'LPAREN (', r"STRING '\1'", 'RPAREN )',
           'PERIOD .', 'ID split', 'LPAREN (', r"STRING '\0'", 'RPAREN )',
           'SEMI ;'])
+    ), (
+        'section_7_8_4_string_literal_with_7_3_conformance',
+        ("'<LF>\\\n<CR>\\\r<LS>\\\u2028<PS>\\\u2029<CR><LF>\\\r\n'",
+         ["STRING '<LF>\\\n<CR>\\\r<LS>\\\u2028<PS>\\\u2029<CR><LF>\\\r\n'"])
+    ),
+]
+
+es5_error_cases = [
+    (
+        'naked_line_separator_in_string',
+        "'test\u2028foo'",
+        'Illegal character "\'" at 1:1',
+    ), (
+        'naked_line_feed_in_string',
+        "'test\u2029foo'",
+        'Illegal character "\'" at 1:1',
+    ), (
+        'naked_crnl_in_string',
+        "'test\r\nfoo'",
+        'Illegal character "\'" at 1:1',
+    ), (
+        'naked_cr_in_string',
+        "'test\\\n\rfoo'",
+        'Illegal character "\'" at 1:1',
     )
 ]
 

diff --git a/src/calmjs/parse/tests/test_es5_lexer.py b/src/calmjs/parse/tests/test_es5_lexer.py
@@ -32,11 +32,13 @@
 from calmjs.parse.exceptions import ECMASyntaxError
 
 from calmjs.parse.testing.util import build_equality_testcase
+from calmjs.parse.testing.util import build_exception_testcase
 from calmjs.parse.tests.lexer import (
     run_lexer,
     run_lexer_pos,
     es5_cases,
     es5_pos_cases,
+    es5_error_cases,
 )
 
 
@@ -68,7 +70,7 @@ def test_extra_ending_braces(self):
 
 
 LexerKeywordTestCase = build_equality_testcase(
-    'LexerTestCase', partial(run_lexer, lexer_cls=Lexer), (
+    'LexerKeywordTestCase', partial(run_lexer, lexer_cls=Lexer), (
         (label, data[0], data[1],) for label, data in [(
             # Keywords
             # ('break case ...', ['BREAK break', 'CASE case', ...])
@@ -84,6 +86,10 @@ def test_extra_ending_braces(self):
     'LexerTestCase', partial(run_lexer, lexer_cls=Lexer), (
         (label, data[0], data[1],) for label, data in es5_cases))
 
+LexerErrorTestCase = build_exception_testcase(
+    'LexerErrorTestCase', partial(
+        run_lexer, lexer_cls=Lexer), es5_error_cases, ECMASyntaxError)
+
 LexerPosTestCase = build_equality_testcase(
     'LexerPosTestCase', partial(
         run_lexer_pos, lexer_cls=Lexer), es5_pos_cases)
diff --git a/src/calmjs/parse/tests/test_es5_parser.py b/src/calmjs/parse/tests/test_es5_parser.py
@@ -2551,6 +2551,15 @@ def regenerate(value):
           'exc';
         """,
         "Unexpected \"'exc'\" at 2:3 after 'throw' at 1:1",
+    ), (
+        # note that a line continuation must not swallow the line number
+        'throw_error_after_line_continuation_lineno',
+        r"""
+        s = 'something \
+        '
+        throw;
+        """,
+        "Unexpected ';' at 3:6 after 'throw' at 3:1",
     ), (
         'setter_single_arg',
         """

diff --git a/src/calmjs/parse/tests/test_es5_unparser.py b/src/calmjs/parse/tests/test_es5_unparser.py
@@ -1759,6 +1759,14 @@ def parse_to_sourcemap_tokens_min(text):
         """
         for (index in [1, 2, 3]) /^salign$/;
         """,
+    ), (
+        'line_continuation_string',
+        r"""
+        {
+          var a = "\
+          ";
+        }
+        """,
     )]))
 )
 
@@ -2147,6 +2155,13 @@ def parse_to_sourcemap_tokens_minify(text):
         })();
         """,
         '(function $(){(function a(){var a=1;})();})();',
+    ), (
+        'line_continuation_string',
+        r"""
+        var a = "\
+          ";
+        """,
+        'var a="  ";',
     )])
 )
 

diff --git a/src/calmjs/parse/tests/test_handlers_simple.py b/src/calmjs/parse/tests/test_handlers_simple.py
@@ -9,6 +9,7 @@
     layout_handler_openbrace,
     layout_handler_closebrace,
     layout_handler_semicolon,
+    deferrable_handler_literal_continuation,
 )
 from calmjs.parse.unparsers.walker import Dispatcher
 
@@ -151,3 +152,10 @@ def run(before, after, prev):
 
         # The first layout rule
         self.assertEqual(run(';', 'function', None), newline)
+
+    def test_deferrable_handler_literal_continuation(self):
+        dispatcher = Dispatcher({}, None, {}, {})
+        node = Node()
+        node.value = '"foo\\\r\nbar"'  # backslash + CRLF inside the literal
+        self.assertEqual('"foobar"', deferrable_handler_literal_continuation(
+            dispatcher, node))
diff --git a/src/calmjs/parse/tests/test_ruletypes.py b/src/calmjs/parse/tests/test_ruletypes.py
@@ -2,8 +2,10 @@
 import unittest
 
 from calmjs.parse.asttypes import Identifier
+from calmjs.parse.asttypes import String
 from calmjs.parse.ruletypes import Declare
 from calmjs.parse.ruletypes import Resolve
+from calmjs.parse.ruletypes import Literal
 
 
 class Node(object):
@@ -119,3 +121,11 @@ def test_resolve(self):
         # if the handler is not implemented
         self.handler = NotImplemented
         self.assertEqual('value', rslv(self.dispatcher, identifier))
+
+    def test_literal_string(self):
+        literal = Literal()
+        string = String('"value"')
+        self.assertEqual('"VALUE"', literal(self.dispatcher, string))
+        # without a handler the literal falls back to the raw node value
+        self.handler = NotImplemented
+        self.assertEqual('"value"', literal(self.dispatcher, string))
diff --git a/src/calmjs/parse/unparsers/es5.py b/src/calmjs/parse/unparsers/es5.py
@@ -32,6 +32,7 @@
     ElisionJoinAttr,
 )
 from calmjs.parse.ruletypes import (
+    Literal,
     Declare,
     Resolve,
     ResolveFuncName,
@@ -154,7 +155,7 @@
         Text(value='null'),
     ),
     'String': (
-        Attr(attr='value'),
+        Attr(Literal()),
     ),
     'Continue': (
         Text(value='continue'), Optional('identifier', (