
Commit

chore(parser): optimize the most common lexer matches into lookup tables (#814)

* Use lookup table for common ASCII values

* Remove unnecessary lifetime limit

---------

Co-authored-by: Renée <[email protected]>
allancalix and goto-bus-stop authored Jan 31, 2024
1 parent e49b34d commit 12db7e8
Showing 3 changed files with 129 additions and 91 deletions.
100 changes: 100 additions & 0 deletions crates/apollo-parser/src/lexer/lookup.rs
@@ -0,0 +1,100 @@
use crate::TokenKind;

static PUNCTUATION_CHARS: [Option<TokenKind>; 256] = punctuation_lut();
static NAMESTART_CHARS: [bool; 256] = namestart_lut();

#[inline]
pub(crate) fn punctuation_kind(c: char) -> Option<TokenKind> {
if c.is_ascii() {
PUNCTUATION_CHARS[c as usize]
} else {
None
}
}

#[inline]
pub(crate) fn is_namestart(c: char) -> bool {
c.is_ascii() && NAMESTART_CHARS[c as usize]
}

const fn punctuation_lut() -> [Option<TokenKind>; 256] {
let mut lut = [None; 256];
lut[b'{' as usize] = Some(TokenKind::LCurly);
lut[b'}' as usize] = Some(TokenKind::RCurly);
lut[b'!' as usize] = Some(TokenKind::Bang);
lut[b'$' as usize] = Some(TokenKind::Dollar);
lut[b'&' as usize] = Some(TokenKind::Amp);
lut[b'(' as usize] = Some(TokenKind::LParen);
lut[b')' as usize] = Some(TokenKind::RParen);
lut[b':' as usize] = Some(TokenKind::Colon);
lut[b',' as usize] = Some(TokenKind::Comma);
lut[b'[' as usize] = Some(TokenKind::LBracket);
lut[b']' as usize] = Some(TokenKind::RBracket);
lut[b'=' as usize] = Some(TokenKind::Eq);
lut[b'@' as usize] = Some(TokenKind::At);
lut[b'|' as usize] = Some(TokenKind::Pipe);

lut
}

/// <https://spec.graphql.org/October2021/#NameStart>
const fn namestart_lut() -> [bool; 256] {
let mut lut = [false; 256];
lut[b'a' as usize] = true;
lut[b'b' as usize] = true;
lut[b'c' as usize] = true;
lut[b'd' as usize] = true;
lut[b'e' as usize] = true;
lut[b'f' as usize] = true;
lut[b'g' as usize] = true;
lut[b'h' as usize] = true;
lut[b'i' as usize] = true;
lut[b'j' as usize] = true;
lut[b'k' as usize] = true;
lut[b'l' as usize] = true;
lut[b'm' as usize] = true;
lut[b'n' as usize] = true;
lut[b'o' as usize] = true;
lut[b'p' as usize] = true;
lut[b'q' as usize] = true;
lut[b'r' as usize] = true;
lut[b's' as usize] = true;
lut[b't' as usize] = true;
lut[b'u' as usize] = true;
lut[b'v' as usize] = true;
lut[b'w' as usize] = true;
lut[b'x' as usize] = true;
lut[b'y' as usize] = true;
lut[b'z' as usize] = true;

lut[b'A' as usize] = true;
lut[b'B' as usize] = true;
lut[b'C' as usize] = true;
lut[b'D' as usize] = true;
lut[b'E' as usize] = true;
lut[b'F' as usize] = true;
lut[b'G' as usize] = true;
lut[b'H' as usize] = true;
lut[b'I' as usize] = true;
lut[b'J' as usize] = true;
lut[b'K' as usize] = true;
lut[b'L' as usize] = true;
lut[b'M' as usize] = true;
lut[b'N' as usize] = true;
lut[b'O' as usize] = true;
lut[b'P' as usize] = true;
lut[b'Q' as usize] = true;
lut[b'R' as usize] = true;
lut[b'S' as usize] = true;
lut[b'T' as usize] = true;
lut[b'U' as usize] = true;
lut[b'V' as usize] = true;
lut[b'W' as usize] = true;
lut[b'X' as usize] = true;
lut[b'Y' as usize] = true;
lut[b'Z' as usize] = true;

lut[b'_' as usize] = true;

lut
}
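
A side note on the letter-by-letter construction above: for loops are not allowed in a const fn (they go through the Iterator trait), but while loops are, so the same NameStart table could be built more compactly. A hypothetical alternative, not part of this commit:

const fn namestart_lut_alt() -> [bool; 256] {
    let mut lut = [false; 256];
    // Mark a-z and the corresponding A-Z entry in one pass.
    let mut b = b'a';
    while b <= b'z' {
        lut[b as usize] = true;
        lut[(b - b'a' + b'A') as usize] = true;
        b += 1;
    }
    lut[b'_' as usize] = true;
    lut
}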
118 changes: 28 additions & 90 deletions crates/apollo-parser/src/lexer/mod.rs
@@ -1,4 +1,5 @@
mod cursor;
mod lookup;
mod token;
mod token_kind;

@@ -146,6 +147,26 @@ impl<'a> Cursor<'a> {
};
match state {
State::Start => {
if let Some(t) = lookup::punctuation_kind(c) {
token.kind = t;
token.data = self.current_str();
return Ok(token);
}

if lookup::is_namestart(c) {
token.kind = TokenKind::Name;
state = State::Ident;

continue;
}

if c != '0' && c.is_ascii_digit() {
token.kind = TokenKind::Int;
state = State::IntegerPart;

continue;
}

match c {
'"' => {
token.kind = TokenKind::StringValue;
@@ -159,14 +180,6 @@ impl<'a> Cursor<'a> {
token.kind = TokenKind::Spread;
state = State::SpreadOperator;
}
c if is_whitespace_assimilated(c) => {
token.kind = TokenKind::Whitespace;
state = State::Whitespace;
}
c if is_name_start(c) => {
token.kind = TokenKind::Name;
state = State::Ident;
}
'-' => {
token.kind = TokenKind::Int;
state = State::MinusSign;
@@ -175,79 +188,9 @@ impl<'a> Cursor<'a> {
token.kind = TokenKind::Int;
state = State::LeadingZero;
}
c if c.is_ascii_digit() => {
token.kind = TokenKind::Int;
state = State::IntegerPart;
}
'!' => {
token.kind = TokenKind::Bang;
token.data = self.current_str();
return Ok(token);
}
'$' => {
token.kind = TokenKind::Dollar;
token.data = self.current_str();
return Ok(token);
}
'&' => {
token.kind = TokenKind::Amp;
token.data = self.current_str();
return Ok(token);
}
'(' => {
token.kind = TokenKind::LParen;
token.data = self.current_str();
return Ok(token);
}
')' => {
token.kind = TokenKind::RParen;
token.data = self.current_str();
return Ok(token);
}
':' => {
token.kind = TokenKind::Colon;
token.data = self.current_str();
return Ok(token);
}
',' => {
token.kind = TokenKind::Comma;
token.data = self.current_str();
return Ok(token);
}
'=' => {
token.kind = TokenKind::Eq;
token.data = self.current_str();
return Ok(token);
}
'@' => {
token.kind = TokenKind::At;
token.data = self.current_str();
return Ok(token);
}
'[' => {
token.kind = TokenKind::LBracket;
token.data = self.current_str();
return Ok(token);
}
']' => {
token.kind = TokenKind::RBracket;
token.data = self.current_str();
return Ok(token);
}
'{' => {
token.kind = TokenKind::LCurly;
token.data = self.current_str();
return Ok(token);
}
'|' => {
token.kind = TokenKind::Pipe;
token.data = self.current_str();
return Ok(token);
}
'}' => {
token.kind = TokenKind::RCurly;
token.data = self.current_str();
return Ok(token);
c if is_whitespace_assimilated(c) => {
token.kind = TokenKind::Whitespace;
state = State::Whitespace;
}
c => {
return Err(Error::new(
@@ -412,7 +355,7 @@ impl<'a> Cursor<'a> {
self.current_str().to_string(),
));
}
_ if is_name_start(c) => {
_ if lookup::is_namestart(c) => {
return Err(Error::new(
format!("Unexpected character `{c}` as integer suffix"),
self.current_str().to_string(),
@@ -433,7 +376,7 @@ impl<'a> Cursor<'a> {
token.kind = TokenKind::Float;
state = State::ExponentIndicator;
}
_ if is_name_start(c) => {
_ if lookup::is_namestart(c) => {
return Err(Error::new(
format!("Unexpected character `{c}` as integer suffix"),
self.current_str().to_string(),
@@ -460,7 +403,7 @@ impl<'a> Cursor<'a> {
'e' | 'E' => {
state = State::ExponentIndicator;
}
_ if c == '.' || is_name_start(c) => {
_ if c == '.' || lookup::is_namestart(c) => {
return Err(Error::new(
format!("Unexpected character `{c}` as float suffix"),
self.current_str().to_string(),
@@ -500,7 +443,7 @@ impl<'a> Cursor<'a> {
_ if c.is_ascii_digit() => {
state = State::ExponentDigit;
}
_ if c == '.' || is_name_start(c) => {
_ if c == '.' || lookup::is_namestart(c) => {
return Err(Error::new(
format!("Unexpected character `{c}` as float suffix"),
self.current_str().to_string(),
@@ -641,11 +584,6 @@ fn is_whitespace_assimilated(c: char) -> bool {
)
}

/// <https://spec.graphql.org/October2021/#NameStart>
fn is_name_start(c: char) -> bool {
matches!(c, 'a'..='z' | 'A'..='Z' | '_')
}

/// <https://spec.graphql.org/October2021/#NameContinue>
fn is_name_continue(c: char) -> bool {
matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '_')
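
For context on why table dispatch in the Start state helps, below is a self-contained micro-benchmark sketch of the general technique. It is illustrative only: it is not part of this commit, and it uses stand-in classifiers rather than apollo-parser's own types or benchmarks.

use std::hint::black_box;
use std::time::Instant;

// Match-based classification, analogous to the lexer's old per-character arms.
fn is_punct_match(c: char) -> bool {
    matches!(
        c,
        '{' | '}' | '!' | '$' | '&' | '(' | ')' | ':' | ',' | '[' | ']' | '=' | '@' | '|'
    )
}

// Table-based classification, analogous to the new PUNCTUATION_CHARS lookup.
static PUNCT_TABLE: [bool; 256] = punct_table();

const fn punct_table() -> [bool; 256] {
    let mut t = [false; 256];
    let puncts = b"{}!$&():,[]=@|";
    let mut i = 0;
    while i < puncts.len() {
        t[puncts[i] as usize] = true;
        i += 1;
    }
    t
}

fn is_punct_table(c: char) -> bool {
    c.is_ascii() && PUNCT_TABLE[c as usize]
}

fn main() {
    let input: Vec<char> = "query Q($id: ID!) { user(id: $id) @include(if: true) { name } }"
        .chars()
        .cycle()
        .take(10_000_000)
        .collect();

    for (name, f) in [
        ("match", is_punct_match as fn(char) -> bool),
        ("table", is_punct_table as fn(char) -> bool),
    ] {
        let start = Instant::now();
        let mut hits = 0u64;
        for &c in &input {
            hits += black_box(f(black_box(c))) as u64;
        }
        println!("{name}: {hits} punctuation chars in {:?}", start.elapsed());
    }
}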
2 changes: 1 addition & 1 deletion crates/apollo-parser/src/lexer/token.rs
@@ -17,7 +17,7 @@ impl<'a> Token<'a> {
}

/// Get a reference to the token's data.
pub fn data(&self) -> &str {
pub fn data(&self) -> &'a str {
self.data
}

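
The token.rs change above widens the return lifetime from the &self borrow to the source text's 'a. A minimal stand-alone sketch of what that enables, using a simplified Token rather than the crate's real definition:

// Simplified stand-in for apollo-parser's Token<'a>; only the field that
// matters for the lifetime change is shown.
struct Token<'a> {
    data: &'a str,
}

impl<'a> Token<'a> {
    // With `-> &str` the result was tied to the `&self` borrow; with
    // `-> &'a str` it borrows the original source text instead.
    fn data(&self) -> &'a str {
        self.data
    }
}

fn main() {
    let source = String::from("query { hero }");
    let text = {
        let token = Token { data: &source[..5] };
        token.data() // the slice borrows `source`, not `token`
    };
    // `token` is dropped here, but the slice stays valid because it points into `source`.
    assert_eq!(text, "query");
}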
