
Streamline lexer.token production for lineno/colno
- Define a standard function that will produce a token with those filled
  in.
- Implement line terminator sequence as per ECMA-262 specification.
- Also fix a very apparent bug in a test case where the 'exc' string is
  on the next line but was reported to be part of the original line, one
  of various cases that will upset the line count.
metatoaster committed Apr 22, 2018
1 parent 9d93523 commit 5074001
Showing 2 changed files with 25 additions and 20 deletions.
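
As a quick illustration of the second bullet: ECMA-262 §7.3 counts <LF>, <CR>, <LS> (U+2028), and <PS> (U+2029) as line terminators, with <CR><LF> forming a single LineTerminatorSequence. A minimal sketch of that counting rule, using the same pattern shape this commit adds (the sample text is made up):

    import re

    # CR alone only matches when not followed by LF, so a CRLF pair
    # advances the line count by one, not two.
    LT_SEQ = re.compile(u'(\n|\r(?!\n)|\u2028|\u2029|\r\n)')

    sample = u'a\r\nb\rc\nd\u2028e'
    # 4 terminator sequences -> the text spans 5 lines
    print(len(LT_SEQ.findall(sample)))  # 4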
src/calmjs/parse/lexers/es5.py (43 changes: 24 additions & 19 deletions)
@@ -24,6 +24,7 @@

 __author__ = 'Ruslan Spivak <[email protected]>'

+import re
 import ply.lex

 from calmjs.parse.lexers.tokens import AutoLexToken
@@ -71,6 +72,9 @@
     'LINE_TERMINATOR', 'LINE_COMMENT', 'BLOCK_COMMENT'
 ])

+PATT_LINE_TERMINATOR_SEQUENCE = re.compile(
+    u'(\n|\r(?!\n)|\u2028|\u2029|\r\n)', flags=re.S)
+

 class Lexer(object):
     """A JavaScript lexer.
@@ -141,16 +145,23 @@ def build(self, **kwargs):
     def input(self, text):
         self.lexer.input(text)

-    def _set_pos(self, token):
-        lines = token.value.splitlines(True)
+    def _update_newline_idx(self, token):
         lexpos = token.lexpos
-        for line in lines:
-            if line[-1:] in '\r\n':
-                lexpos += len(line)
-                self.lexer.lineno += 1
-                self.newline_idx.append(lexpos)
+        fragments = PATT_LINE_TERMINATOR_SEQUENCE.split(token.value)
+        for fragment, newline in zip(*[iter(fragments)] * 2):
+            lexpos += len(fragment + newline)
+            self.lexer.lineno += 1
+            self.newline_idx.append(lexpos)
+
+    def get_lexer_token(self):
+        token = self.lexer.token()
+        if token:
+            token.colno = self._get_colno(token)
+            self._update_newline_idx(token)
+        return token

     def token(self):
         # auto-semi tokens that got added
         if self.next_tokens:
             return self.next_tokens.pop()
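
A note on the pairing idiom in `_update_newline_idx` above: because the pattern has a single capturing group, `re.split` returns text fragments and the terminators that follow them in alternation, and `zip(*[iter(fragments)] * 2)` consumes that list two items at a time. A standalone sketch (the sample input is illustrative only):

    import re

    patt = re.compile(u'(\n|\r(?!\n)|\u2028|\u2029|\r\n)')
    fragments = patt.split(u'var a;\r\nvar b;\nvar c;')
    # ['var a;', '\r\n', 'var b;', '\n', 'var c;']

    # Two references to one iterator: zip pulls a fragment, then its
    # terminator. The final fragment ('var c;') has no terminator
    # partner and is dropped, which is what line counting wants.
    for fragment, newline in zip(*[iter(fragments)] * 2):
        print(repr(fragment), repr(newline))
    # 'var a;' '\r\n'
    # 'var b;' '\n'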

@@ -166,15 +177,13 @@ def token(self):
             except IndexError:
                 tok = self._get_update_token()
                 if tok is not None and tok.type == 'LINE_TERMINATOR':
-                    self._set_pos(tok)
                     continue
                 else:
                     return tok

             if char != '/' or (char == '/' and next_char in ('/', '*')):
                 tok = self._get_update_token()
                 if tok.type in DIVISION_SYNTAX_MARKERS:
-                    self._set_pos(tok)
                     continue
                 else:
                     return tok
@@ -226,12 +235,12 @@ def _is_prev_token_lt(self):

     def _read_regex(self):
         self.lexer.begin('regex')
-        token = self.lexer.token()
+        token = self.get_lexer_token()
         self.lexer.begin('INITIAL')
         return token

     def _get_update_token(self):
-        self._set_tokens(self.lexer.token())
+        self._set_tokens(self.get_lexer_token())

         if self.cur_token is not None:

@@ -259,7 +268,7 @@ def _get_update_token(self):
                     "Mismatched '%s' at %d:%d" % (
                         self.cur_token.value,
                         self.cur_token.lineno,
-                        self._get_colno(self.cur_token),
+                        self.cur_token.colno,
                     )
                 )

@@ -272,12 +281,7 @@
                 'RETURN', 'THROW']):
             return self._create_semi_token(self.cur_token)

-        return self._set_colno(self.cur_token)
-
-    def _set_colno(self, token):
-        if token:
-            token.colno = self._get_colno(token)
-        return token
+        return self.cur_token

     def _get_colno(self, token):
         # have a 1 offset to map nicer to commonly used/configured
@@ -465,7 +469,8 @@ def t_regex_error(self, token):
     t_LINE_COMMENT = r'//[^\r\n]*'
     t_BLOCK_COMMENT = r'/\*[^*]*\*+([^/*][^*]*\*+)*/'

-    t_LINE_TERMINATOR = r'[\n\r]+'
+    # 7.3 Line Terminators
+    t_LINE_TERMINATOR = r'\s'

     t_ignore = (
         # space, tab, line tab, form feed, nbsp
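
The body of `_get_colno` sits outside this diff, but given that `newline_idx` records the lexpos of each line start, the column lookup presumably reduces to a bisect plus the 1-offset noted in its comment. A guess at the mechanism, not the project's actual code:

    from bisect import bisect

    def colno_from_lexpos(newline_idx, lexpos):
        # Hypothetical helper; assumes newline_idx[0] == 0 and each
        # later entry is the lexpos just past a line terminator.
        idx = bisect(newline_idx, lexpos)
        # 1-offset so columns match what text editors display.
        return lexpos - newline_idx[idx - 1] + 1

    # For "throw\n  'exc';" the string token sits at lexpos 8 and
    # newline_idx == [0, 6], so this yields 8 - 6 + 1 == 3.
    print(colno_from_lexpos([0, 6], 8))  # 3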
src/calmjs/parse/tests/test_es5_parser.py (2 changes: 1 addition & 1 deletion)

@@ -2550,7 +2550,7 @@ def regenerate(value):
             throw
               'exc';
             """,
-            "Unexpected \"'exc'\" at 1:9 after 'throw' at 1:1",
+            "Unexpected \"'exc'\" at 2:3 after 'throw' at 1:1",
         ), (
             'setter_single_arg',
             """
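
The corrected expectation can be reproduced by hand: `throw` is a restricted production, so the line terminator after it forces an error at the string on the second line, now reported with its true position. The top-level `es5` helper shown here is the package's current public entry point and may postdate this commit:

    from calmjs.parse import es5

    es5(u"throw\n  'exc';")
    # ECMASyntaxError: Unexpected "'exc'" at 2:3 after 'throw' at 1:1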
