Streamline lexer.token production for lineno/colno
- Define a standard function that will produce a token with those filled in.
- Implement the line terminator sequence as per the ECMA-262 specification.
- Also fix a very apparent bug in a test case where the `exc` string is on the next line but was reported as part of the original line, one of various cases that will upset the line count.
1 parent: 9d93523
commit: 5074001
Showing 2 changed files with 25 additions and 20 deletions.
@@ -24,6 +24,7 @@
 __author__ = 'Ruslan Spivak <[email protected]>'
 
+import re
 import ply.lex
 
 from calmjs.parse.lexers.tokens import AutoLexToken
@@ -71,6 +72,9 @@
     'LINE_TERMINATOR', 'LINE_COMMENT', 'BLOCK_COMMENT'
 ])
 
+PATT_LINE_TERMINATOR_SEQUENCE = re.compile(
+    ur'(\n|\r(?!\n)|\u2028|\u2029|\r\n)', flags=re.S)
+
 
 class Lexer(object):
     """A JavaScript lexer.
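Since the pattern keeps the whole alternation in a single capturing group, re.split returns source fragments and the terminators that follow them in alternation, which the rewritten _update_newline_idx in the next hunk consumes pairwise. A minimal sketch of that behaviour, not part of the commit and spelled as a Python 3 literal rather than the ur'' form used in the diff:

    import re

    # The same alternation as PATT_LINE_TERMINATOR_SEQUENCE, as a Python 3 string.
    PATT = re.compile(u'(\\n|\\r(?!\\n)|\u2028|\u2029|\\r\\n)')

    fragments = PATT.split(u'var a;\nvar b;\u2028var c;\r\n')
    # fragments == ['var a;', '\n', 'var b;', '\u2028', 'var c;', '\r\n', '']
    for fragment, terminator in zip(*[iter(fragments)] * 2):
        print(repr(fragment), repr(terminator))

The trailing empty fragment left after a final terminator is simply dropped by the pairwise zip.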
@@ -141,16 +145,23 @@ def build(self, **kwargs):
     def input(self, text):
         self.lexer.input(text)
 
-    def _set_pos(self, token):
-        lines = token.value.splitlines(True)
+    def _update_newline_idx(self, token):
         lexpos = token.lexpos
-        for line in lines:
-            if line[-1:] in '\r\n':
-                lexpos += len(line)
-                self.lexer.lineno += 1
-                self.newline_idx.append(lexpos)
+        fragments = PATT_LINE_TERMINATOR_SEQUENCE.split(token.value)
+        for fragment, newline in zip(*[iter(fragments)] * 2):
+            lexpos += len(fragment + newline)
+            self.lexer.lineno += 1
+            self.newline_idx.append(lexpos)
+
+    def get_lexer_token(self):
+        token = self.lexer.token()
+        if token:
+            token.colno = self._get_colno(token)
+            self._update_newline_idx(token)
+        return token
 
     def token(self):
         # auto-semi tokens that got added
         if self.next_tokens:
             return self.next_tokens.pop()
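A plausible motivation for the ECMA-262 bullet in the commit message (an inference, not something the commit states): the old splitlines-based _set_pos only counted lines whose last character was '\r' or '\n', so the U+2028/U+2029 terminators that str.splitlines does split on never advanced lineno. Plain Python shows the mismatch:

    value = u'a\u2028b\nc'
    for line in value.splitlines(True):
        # Old-style check: only '\r' and '\n' endings bump the counter, so the
        # U+2028 (LINE SEPARATOR) ending of the first line is never counted.
        print(repr(line), line[-1:] in '\r\n')
    # 'a\u2028' False
    # 'b\n'     True
    # 'c'       False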
@@ -166,15 +177,13 @@ def token(self):
             except IndexError:
                 tok = self._get_update_token()
                 if tok is not None and tok.type == 'LINE_TERMINATOR':
-                    self._set_pos(tok)
                     continue
                 else:
                     return tok
 
             if char != '/' or (char == '/' and next_char in ('/', '*')):
                 tok = self._get_update_token()
                 if tok.type in DIVISION_SYNTAX_MARKERS:
-                    self._set_pos(tok)
                     continue
                 else:
                     return tok
@@ -226,12 +235,12 @@ def _is_prev_token_lt(self):
 
     def _read_regex(self):
         self.lexer.begin('regex')
-        token = self.lexer.token()
+        token = self.get_lexer_token()
         self.lexer.begin('INITIAL')
         return token
 
     def _get_update_token(self):
-        self._set_tokens(self.lexer.token())
+        self._set_tokens(self.get_lexer_token())
 
         if self.cur_token is not None:
 
@@ -259,7 +268,7 @@ def _get_update_token(self):
                     "Mismatched '%s' at %d:%d" % (
                         self.cur_token.value,
                         self.cur_token.lineno,
-                        self._get_colno(self.cur_token),
+                        self.cur_token.colno,
                     )
                 )
 
@@ -272,12 +281,7 @@ def _get_update_token(self):
                 'RETURN', 'THROW']):
             return self._create_semi_token(self.cur_token)
 
-        return self._set_colno(self.cur_token)
-
-    def _set_colno(self, token):
-        if token:
-            token.colno = self._get_colno(token)
-        return token
+        return self.cur_token
 
     def _get_colno(self, token):
         # have a 1 offset to map nicer to commonly used/configured
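With _set_colno gone, get_lexer_token now stamps colno on every token as it is produced, using the newline_idx bookkeeping. As a rough illustration of how a list of line-start offsets yields the 1-based column that the _get_colno comment refers to (the helper name and the bisect lookup here are assumptions for the example, not the project's actual implementation):

    from bisect import bisect_right

    def colno_from_index(newline_idx, lexpos):
        # newline_idx holds offsets where lines begin: 0 for the first line, and
        # the offset just past each terminator appended by _update_newline_idx.
        line_start = newline_idx[bisect_right(newline_idx, lexpos) - 1]
        return lexpos - line_start + 1  # 1-based, per the comment in the diff

    # A token at absolute offset 10, with lines starting at 0, 8 and 17:
    print(colno_from_index([0, 8, 17], 10))  # 3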
@@ -465,7 +469,8 @@ def t_regex_error(self, token):
     t_LINE_COMMENT = r'//[^\r\n]*'
     t_BLOCK_COMMENT = r'/\*[^*]*\*+([^/*][^*]*\*+)*/'
 
-    t_LINE_TERMINATOR = r'[\n\r]+'
+    # 7.3 Line Terminators
+    t_LINE_TERMINATOR = r'\s'
 
     t_ignore = (
         # space, tab, line tab, form feed, nbsp