Merge #56

56: Unify lookahead naming between parser and lexer. r=matklad a=zachlute Resolves Issue #26. I wanted to play around with libsyntax2, and fixing a random issue seemed like a good way to mess around in the code. This PR mostly does what's suggested in that issue. I elected to go with `at` and `at_str` instead of trying to do any fancy overloading shenanigans, because...uh, well, frankly I don't really know how to do any fancy overloading shenanigans. The only really questionable bit is `nth_is_p`, which could also have potentially been named `nth_at_p`, but `is` seemed more apropos. I also added simple tests for `Ptr` so I could be less terrified I broke something. Comments and criticisms very welcome. I'm still pretty new to Rust. Co-authored-by: Zach Lute <[email protected]>
rust-lang · Sep 5, 2018 · ad45168 · ad45168
2 parents 649f7fa + d21fead
commit ad45168
Show file tree

Hide file tree

Showing 5 changed files with 137 additions and 45 deletions.
diff --git a/crates/libsyntax2/src/lexer/comments.rs b/crates/libsyntax2/src/lexer/comments.rs
@@ -3,7 +3,7 @@ use lexer::ptr::Ptr;
 use SyntaxKind::{self, *};
 
 pub(crate) fn scan_shebang(ptr: &mut Ptr) -> bool {
-    if ptr.next_is('!') && ptr.nnext_is('/') {
+    if ptr.at_str("!/") {
         ptr.bump();
         ptr.bump();
         bump_until_eol(ptr);
@@ -14,15 +14,15 @@ pub(crate) fn scan_shebang(ptr: &mut Ptr) -> bool {
 }
 
 fn scan_block_comment(ptr: &mut Ptr) -> Option<SyntaxKind> {
-    if ptr.next_is('*') {
+    if ptr.at('*') {
         ptr.bump();
         let mut depth: u32 = 1;
         while depth > 0 {
-            if ptr.next_is('*') && ptr.nnext_is('/') {
+            if ptr.at_str("*/") {
                 depth -= 1;
                 ptr.bump();
                 ptr.bump();
-            } else if ptr.next_is('/') && ptr.nnext_is('*') {
+            } else if ptr.at_str("/*") {
                 depth += 1;
                 ptr.bump();
                 ptr.bump();
@@ -37,7 +37,7 @@ fn scan_block_comment(ptr: &mut Ptr) -> Option<SyntaxKind> {
 }
 
 pub(crate) fn scan_comment(ptr: &mut Ptr) -> Option<SyntaxKind> {
-    if ptr.next_is('/') {
+    if ptr.at('/') {
         bump_until_eol(ptr);
         Some(COMMENT)
     } else {
@@ -47,7 +47,7 @@ pub(crate) fn scan_comment(ptr: &mut Ptr) -> Option<SyntaxKind> {
 
 fn bump_until_eol(ptr: &mut Ptr) {
     loop {
-        if ptr.next_is('\n') || ptr.next_is('\r') && ptr.nnext_is('\n') {
+        if ptr.at('\n') || ptr.at_str("\r\n") {
             return;
         }
         if ptr.bump().is_none() {

diff --git a/crates/libsyntax2/src/lexer/mod.rs b/crates/libsyntax2/src/lexer/mod.rs
@@ -67,7 +67,7 @@ fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind {
         _ => (),
     }
 
-    let ident_start = is_ident_start(c) && !is_string_literal_start(c, ptr.next(), ptr.nnext());
+    let ident_start = is_ident_start(c) && !is_string_literal_start(c, ptr.current(), ptr.nth(1));
     if ident_start {
         return scan_ident(c, ptr);
     }
@@ -86,7 +86,7 @@ fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind {
     match c {
         // Multi-byte tokens.
         '.' => {
-            return match (ptr.next(), ptr.nnext()) {
+            return match (ptr.current(), ptr.nth(1)) {
                 (Some('.'), Some('.')) => {
                     ptr.bump();
                     ptr.bump();
@@ -105,7 +105,7 @@ fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind {
             };
         }
         ':' => {
-            return match ptr.next() {
+            return match ptr.current() {
                 Some(':') => {
                     ptr.bump();
                     COLONCOLON
@@ -114,7 +114,7 @@ fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind {
             };
         }
         '=' => {
-            return match ptr.next() {
+            return match ptr.current() {
                 Some('=') => {
                     ptr.bump();
                     EQEQ
@@ -127,7 +127,7 @@ fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind {
             };
         }
         '!' => {
-            return match ptr.next() {
+            return match ptr.current() {
                 Some('=') => {
                     ptr.bump();
                     NEQ
@@ -136,7 +136,7 @@ fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind {
             };
         }
         '-' => {
-            return if ptr.next_is('>') {
+            return if ptr.at('>') {
                 ptr.bump();
                 THIN_ARROW
             } else {
@@ -147,14 +147,14 @@ fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind {
         // If the character is an ident start not followed by another single
         // quote, then this is a lifetime name:
         '\'' => {
-            return if ptr.next_is_p(is_ident_start) && !ptr.nnext_is('\'') {
+            return if ptr.at_p(is_ident_start) && !ptr.at_str("''") {
                 ptr.bump();
-                while ptr.next_is_p(is_ident_continue) {
+                while ptr.at_p(is_ident_continue) {
                     ptr.bump();
                 }
                 // lifetimes shouldn't end with a single quote
                 // if we find one, then this is an invalid character literal
-                if ptr.next_is('\'') {
+                if ptr.at('\'') {
                     ptr.bump();
                     return CHAR; // TODO: error reporting
                 }
@@ -186,7 +186,7 @@ fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind {
 }
 
 fn scan_ident(c: char, ptr: &mut Ptr) -> SyntaxKind {
-    let is_single_letter = match ptr.next() {
+    let is_single_letter = match ptr.current() {
         None => true,
         Some(c) if !is_ident_continue(c) => true,
         _ => false,
@@ -202,7 +202,7 @@ fn scan_ident(c: char, ptr: &mut Ptr) -> SyntaxKind {
 }
 
 fn scan_literal_suffix(ptr: &mut Ptr) {
-    if ptr.next_is_p(is_ident_start) {
+    if ptr.at_p(is_ident_start) {
         ptr.bump();
     }
     ptr.bump_while(is_ident_continue);

diff --git a/crates/libsyntax2/src/lexer/numbers.rs b/crates/libsyntax2/src/lexer/numbers.rs
@@ -5,7 +5,7 @@ use SyntaxKind::{self, *};
 
 pub(crate) fn scan_number(c: char, ptr: &mut Ptr) -> SyntaxKind {
     if c == '0' {
-        match ptr.next().unwrap_or('\0') {
+        match ptr.current().unwrap_or('\0') {
             'b' | 'o' => {
                 ptr.bump();
                 scan_digits(ptr, false);
@@ -26,7 +26,7 @@ pub(crate) fn scan_number(c: char, ptr: &mut Ptr) -> SyntaxKind {
     // might be a float, but don't be greedy if this is actually an
     // integer literal followed by field/method access or a range pattern
     // (`0..2` and `12.foo()`)
-    if ptr.next_is('.') && !(ptr.nnext_is('.') || ptr.nnext_is_p(is_ident_start)) {
+    if ptr.at('.') && !(ptr.at_str("..") || ptr.nth_is_p(1, is_ident_start)) {
         // might have stuff after the ., and if it does, it needs to start
         // with a number
         ptr.bump();
@@ -35,15 +35,15 @@ pub(crate) fn scan_number(c: char, ptr: &mut Ptr) -> SyntaxKind {
         return FLOAT_NUMBER;
     }
     // it might be a float if it has an exponent
-    if ptr.next_is('e') || ptr.next_is('E') {
+    if ptr.at('e') || ptr.at('E') {
         scan_float_exponent(ptr);
         return FLOAT_NUMBER;
     }
     INT_NUMBER
 }
 
 fn scan_digits(ptr: &mut Ptr, allow_hex: bool) {
-    while let Some(c) = ptr.next() {
+    while let Some(c) = ptr.current() {
         match c {
             '_' | '0'...'9' => {
                 ptr.bump();
@@ -57,9 +57,9 @@ fn scan_digits(ptr: &mut Ptr, allow_hex: bool) {
 }
 
 fn scan_float_exponent(ptr: &mut Ptr) {
-    if ptr.next_is('e') || ptr.next_is('E') {
+    if ptr.at('e') || ptr.at('E') {
         ptr.bump();
-        if ptr.next_is('-') || ptr.next_is('+') {
+        if ptr.at('-') || ptr.at('+') {
             ptr.bump();
         }
         scan_digits(ptr, false);

diff --git a/crates/libsyntax2/src/lexer/ptr.rs b/crates/libsyntax2/src/lexer/ptr.rs
@@ -2,58 +2,70 @@ use TextUnit;
 
 use std::str::Chars;
 
+/// A simple view into the characters of a string.
 pub(crate) struct Ptr<'s> {
     text: &'s str,
     len: TextUnit,
 }
 
 impl<'s> Ptr<'s> {
+    /// Creates a new `Ptr` from a string.
     pub fn new(text: &'s str) -> Ptr<'s> {
         Ptr {
             text,
             len: 0.into(),
         }
     }
 
+    /// Consumes the `Ptr` and returns the length of the text consumed so far.
     pub fn into_len(self) -> TextUnit {
         self.len
     }
 
-    pub fn next(&self) -> Option<char> {
+    /// Gets the current character, if one exists.
+    pub fn current(&self) -> Option<char> {
         self.chars().next()
     }
 
-    pub fn nnext(&self) -> Option<char> {
-        let mut chars = self.chars();
-        chars.next()?;
-        chars.next()
+    /// Gets the nth character from the current.
+    /// For example, 0 will return the current character, 1 will return the next, etc.
+    pub fn nth(&self, n: u32) -> Option<char> {
+        let mut chars = self.chars().peekable();
+        chars.by_ref().skip(n as usize).next()
     }
 
-    pub fn next_is(&self, c: char) -> bool {
-        self.next() == Some(c)
+    /// Checks whether the current character is `c`.
+    pub fn at(&self, c: char) -> bool {
+        self.current() == Some(c)
     }
 
-    pub fn nnext_is(&self, c: char) -> bool {
-        self.nnext() == Some(c)
+    /// Checks whether the remaining text, starting at the current character, begins with `s`.
+    pub fn at_str(&self, s: &str) -> bool {
+        let chars = self.chars();
+        chars.as_str().starts_with(s)
     }
 
-    pub fn next_is_p<P: Fn(char) -> bool>(&self, p: P) -> bool {
-        self.next().map(p) == Some(true)
+    /// Checks whether the current character satisfies the predicate `p`.
+    pub fn at_p<P: Fn(char) -> bool>(&self, p: P) -> bool {
+        self.current().map(p) == Some(true)
     }
 
-    pub fn nnext_is_p<P: Fn(char) -> bool>(&self, p: P) -> bool {
-        self.nnext().map(p) == Some(true)
+    /// Checks whether the nth character satisfies the predicate `p`.
+    pub fn nth_is_p<P: Fn(char) -> bool>(&self, n: u32, p: P) -> bool {
+        self.nth(n).map(p) == Some(true)
     }
 
+    /// Advances past the current character and returns it, or returns `None` at the end of the text.
     pub fn bump(&mut self) -> Option<char> {
         let ch = self.chars().next()?;
         self.len += TextUnit::of_char(ch);
         Some(ch)
     }
 
+    /// Moves to the next character as long as `pred` is satisfied.
     pub fn bump_while<F: Fn(char) -> bool>(&mut self, pred: F) {
         loop {
-            match self.next() {
+            match self.current() {
                 Some(c) if pred(c) => {
                     self.bump();
                 }
@@ -62,13 +74,93 @@ impl<'s> Ptr<'s> {
         }
     }
 
+    /// Returns the text up to the current point.
     pub fn current_token_text(&self) -> &str {
         let len: u32 = self.len.into();
         &self.text[..len as usize]
     }
 
+    /// Returns an iterator over the remaining characters.
     fn chars(&self) -> Chars {
         let len: u32 = self.len.into();
         self.text[len as usize..].chars()
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_current() {
+        let ptr = Ptr::new("test");
+        assert_eq!(ptr.current(), Some('t'));
+    }
+
+    #[test]
+    fn test_nth() {
+        let ptr = Ptr::new("test");
+        assert_eq!(ptr.nth(0), Some('t'));
+        assert_eq!(ptr.nth(1), Some('e'));
+        assert_eq!(ptr.nth(2), Some('s'));
+        assert_eq!(ptr.nth(3), Some('t'));
+        assert_eq!(ptr.nth(4), None);
+    }
+
+    #[test]
+    fn test_at() {
+        let ptr = Ptr::new("test");
+        assert!(ptr.at('t'));
+        assert!(!ptr.at('a'));
+    }
+
+    #[test]
+    fn test_at_str() {
+        let ptr = Ptr::new("test");
+        assert!(ptr.at_str("t"));
+        assert!(ptr.at_str("te"));
+        assert!(ptr.at_str("test"));
+        assert!(!ptr.at_str("tests"));
+        assert!(!ptr.at_str("rust"));
+    }
+
+    #[test]
+    fn test_at_p() {
+        let ptr = Ptr::new("test");
+        assert!(ptr.at_p(|c| c == 't'));
+        assert!(!ptr.at_p(|c| c == 'e'));
+    }
+
+    #[test]
+    fn test_nth_is_p() {
+        let ptr = Ptr::new("test");
+        assert!(ptr.nth_is_p(0,|c| c == 't'));
+        assert!(!ptr.nth_is_p(1,|c| c == 't'));
+        assert!(ptr.nth_is_p(3,|c| c == 't'));
+        assert!(!ptr.nth_is_p(150,|c| c == 't'));
+    }
+
+    #[test]
+    fn test_bump() {
+        let mut ptr = Ptr::new("test");
+        assert_eq!(ptr.current(), Some('t'));
+        ptr.bump();
+        assert_eq!(ptr.current(), Some('e'));
+        ptr.bump();
+        assert_eq!(ptr.current(), Some('s'));
+        ptr.bump();
+        assert_eq!(ptr.current(), Some('t'));
+        ptr.bump();
+        assert_eq!(ptr.current(), None);
+        ptr.bump();
+        assert_eq!(ptr.current(), None);
+    }
+
+    #[test]
+    fn test_bump_while() {
+        let mut ptr = Ptr::new("test");
+        assert_eq!(ptr.current(), Some('t'));
+        ptr.bump_while(|c| c != 's');
+        assert_eq!(ptr.current(), Some('s'));
+    }
+}