
Streamline lexer.token production for lineno/colno
- Define a standard function that will produce a token with those filled
  in.
- Implement line terminator sequence as per ECMA-262 specification.
- Also fix a very apparent bug in a test case where the 'exc' string is
  on the next line but was reported to be part of the original line, one
  of various cases that will upset the line count.
metatoaster committed Apr 22, 2018
1 parent 9d93523 commit 5074001
Showing 2 changed files with 25 additions and 20 deletions.
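
As a quick illustration of the second bullet: ECMA-262 §7.3 counts <LF>, <CR>, <LS> (U+2028), and <PS> (U+2029) as line terminators, with <CR><LF> forming a single LineTerminatorSequence. A minimal sketch of that counting rule, using the same pattern shape this commit adds (the sample text is made up):

    import re

    # CR alone only matches when not followed by LF, so a CRLF pair
    # advances the line count by one, not two.
    LT_SEQ = re.compile(u'(\n|\r(?!\n)|\u2028|\u2029|\r\n)')

    sample = u'a\r\nb\rc\nd\u2028e'
    # 4 terminator sequences -> the text spans 5 lines
    print(len(LT_SEQ.findall(sample)))  # 4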
src/calmjs/parse/lexers/es5.py (43 changes: 24 additions & 19 deletions)
@@ -24,6 +24,7 @@

 __author__ = 'Ruslan Spivak <[email protected]>'

+import re
 import ply.lex

 from calmjs.parse.lexers.tokens import AutoLexToken
@@ -71,6 +72,9 @@
     'LINE_TERMINATOR', 'LINE_COMMENT', 'BLOCK_COMMENT'
 ])

+PATT_LINE_TERMINATOR_SEQUENCE = re.compile(
+    u'(\n|\r(?!\n)|\u2028|\u2029|\r\n)', flags=re.S)
+

 class Lexer(object):
     """A JavaScript lexer.
@@ -141,16 +145,23 @@ def build(self, **kwargs):
     def input(self, text):
         self.lexer.input(text)

-    def _set_pos(self, token):
-        lines = token.value.splitlines(True)
+    def _update_newline_idx(self, token):
         lexpos = token.lexpos
-        for line in lines:
-            if line[-1:] in '\r\n':
-                lexpos += len(line)
-                self.lexer.lineno += 1
-                self.newline_idx.append(lexpos)
+        fragments = PATT_LINE_TERMINATOR_SEQUENCE.split(token.value)
+        for fragment, newline in zip(*[iter(fragments)] * 2):
+            lexpos += len(fragment + newline)
+            self.lexer.lineno += 1
+            self.newline_idx.append(lexpos)
+
+    def get_lexer_token(self):
+        token = self.lexer.token()
+        if token:
+            token.colno = self._get_colno(token)
+            self._update_newline_idx(token)
+        return token

     def token(self):
         # auto-semi tokens that got added
         if self.next_tokens:
             return self.next_tokens.pop()
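
A note on the pairing idiom in `_update_newline_idx` above: because the pattern has a single capturing group, `re.split` returns text fragments and the terminators that follow them in alternation, and `zip(*[iter(fragments)] * 2)` consumes that list two items at a time. A standalone sketch (the sample input is illustrative only):

    import re

    patt = re.compile(u'(\n|\r(?!\n)|\u2028|\u2029|\r\n)')
    fragments = patt.split(u'var a;\r\nvar b;\nvar c;')
    # ['var a;', '\r\n', 'var b;', '\n', 'var c;']

    # Two references to one iterator: zip pulls a fragment, then its
    # terminator. The final fragment ('var c;') has no terminator
    # partner and is dropped, which is what line counting wants.
    for fragment, newline in zip(*[iter(fragments)] * 2):
        print(repr(fragment), repr(newline))
    # 'var a;' '\r\n'
    # 'var b;' '\n'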

@@ -166,15 +177,13 @@ def token(self):
             except IndexError:
                 tok = self._get_update_token()
                 if tok is not None and tok.type == 'LINE_TERMINATOR':
-                    self._set_pos(tok)
                     continue
                 else:
                     return tok

             if char != '/' or (char == '/' and next_char in ('/', '*')):
                 tok = self._get_update_token()
                 if tok.type in DIVISION_SYNTAX_MARKERS:
-                    self._set_pos(tok)
                     continue
                 else:
                     return tok
@@ -226,12 +235,12 @@ def _is_prev_token_lt(self):

     def _read_regex(self):
         self.lexer.begin('regex')
-        token = self.lexer.token()
+        token = self.get_lexer_token()
         self.lexer.begin('INITIAL')
         return token

     def _get_update_token(self):
-        self._set_tokens(self.lexer.token())
+        self._set_tokens(self.get_lexer_token())

         if self.cur_token is not None:

@@ -259,7 +268,7 @@ def _get_update_token(self):
                     "Mismatched '%s' at %d:%d" % (
                         self.cur_token.value,
                         self.cur_token.lineno,
-                        self._get_colno(self.cur_token),
+                        self.cur_token.colno,
                     )
                 )

@@ -272,12 +281,7 @@
                 'RETURN', 'THROW']):
             return self._create_semi_token(self.cur_token)

-        return self._set_colno(self.cur_token)
-
-    def _set_colno(self, token):
-        if token:
-            token.colno = self._get_colno(token)
-        return token
+        return self.cur_token

     def _get_colno(self, token):
         # have a 1 offset to map nicer to commonly used/configured
@@ -465,7 +469,8 @@ def t_regex_error(self, token):
     t_LINE_COMMENT = r'//[^\r\n]*'
     t_BLOCK_COMMENT = r'/\*[^*]*\*+([^/*][^*]*\*+)*/'

-    t_LINE_TERMINATOR = r'[\n\r]+'
+    # 7.3 Line Terminators
+    t_LINE_TERMINATOR = r'\s'

     t_ignore = (
         # space, tab, line tab, form feed, nbsp
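
The body of `_get_colno` sits outside this diff, but given that `newline_idx` records the lexpos of each line start, the column lookup presumably reduces to a bisect plus the 1-offset noted in its comment. A guess at the mechanism, not the project's actual code:

    from bisect import bisect

    def colno_from_lexpos(newline_idx, lexpos):
        # Hypothetical helper; assumes newline_idx[0] == 0 and each
        # later entry is the lexpos just past a line terminator.
        idx = bisect(newline_idx, lexpos)
        # 1-offset so columns match what text editors display.
        return lexpos - newline_idx[idx - 1] + 1

    # For "throw\n  'exc';" the string token sits at lexpos 8 and
    # newline_idx == [0, 6], so this yields 8 - 6 + 1 == 3.
    print(colno_from_lexpos([0, 6], 8))  # 3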
src/calmjs/parse/tests/test_es5_parser.py (2 changes: 1 addition & 1 deletion)

@@ -2550,7 +2550,7 @@ def regenerate(value):
             throw
               'exc';
             """,
-            "Unexpected \"'exc'\" at 1:9 after 'throw' at 1:1",
+            "Unexpected \"'exc'\" at 2:3 after 'throw' at 1:1",
         ), (
             'setter_single_arg',
             """
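
The corrected expectation can be reproduced by hand: `throw` is a restricted production, so the line terminator after it forces an error at the string on the second line, now reported with its true position. The top-level `es5` helper shown here is the package's current public entry point and may postdate this commit:

    from calmjs.parse import es5

    es5(u"throw\n  'exc';")
    # ECMASyntaxError: Unexpected "'exc'" at 2:3 after 'throw' at 1:1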
