Skip to content

Commit

Permalink
feat: now properly parses number suffixes + lints their capitalization
Browse files Browse the repository at this point in the history
  • Loading branch information
Elijah Potter authored and Elijah Potter committed May 14, 2024
1 parent 5f32ae6 commit 7d65264
Show file tree
Hide file tree
Showing 12 changed files with 197 additions and 13 deletions.
59 changes: 57 additions & 2 deletions harper-core/src/document.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ use crate::linting::Suggestion;
use crate::parsers::{Markdown, Parser, PlainEnglish};
use crate::punctuation::Punctuation;
use crate::span::Span;
use crate::token::NumberSuffix;
use crate::{FatToken, Token, TokenKind, TokenStringExt};

pub struct Document {
Expand Down Expand Up @@ -55,12 +56,24 @@ impl Document {
/// Lex `self.source` into `self.tokens`, then run the token
/// post-processing passes over the fresh token stream.
fn parse(&mut self) {
    self.tokens = self.parser.parse(&self.source);
    // NOTE(review): pass order looks significant — contractions are condensed
    // before number suffixes, and quote matching runs on the fully condensed
    // stream. Confirm before reordering.
    self.condense_contractions();
    self.condense_number_suffixes();
    self.match_quotes();
}

/// Given a list of indices, this function removes the subsequent
/// `stretch_len - 1` elements after each index.
///
/// Will extend token spans to include removed elements.
/// Assumes condensed tokens are contiguous in source text.
fn condense_indices(&mut self, indices: &[usize], stretch_len: usize) {
// Update spans
for idx in indices {
let end_tok = self.tokens[idx + stretch_len - 1];
let start_tok = &mut self.tokens[*idx];

start_tok.span.end = end_tok.span.end;
}

// Trim
let old = self.tokens.clone();
self.tokens.clear();
Expand All @@ -86,7 +99,7 @@ impl Document {
.last()
.map(|v| v + stretch_len)
.unwrap_or(indices.len())..]
)
);
}

pub fn get_token_at_char_index(&self, char_index: usize) -> Option<Token> {
Expand Down Expand Up @@ -243,8 +256,34 @@ impl Document {
}
}

/// Searches for number suffixes (e.g. the "st" in "1st") and condenses them
/// down into single number tokens carrying the parsed [`NumberSuffix`].
fn condense_number_suffixes(&mut self) {
    if self.tokens.len() < 2 {
        return;
    }

    let mut replace_starts = Vec::new();

    for idx in 0..self.tokens.len() - 1 {
        let b = self.tokens[idx + 1];
        let a = self.tokens[idx];

        // TODO: Allow spaces between `a` and `b`

        if let (TokenKind::Number(..), TokenKind::Word) = (a.kind, b.kind) {
            let word_chars = self.get_span_content(b.span);

            // Only a word that is *exactly* a two-character suffix may be
            // merged. `NumberSuffix::from_chars` matches on the first two
            // characters alone, so without this guard a longer word like
            // "sty" in "1sty" would be wrongly absorbed into the number.
            if word_chars.len() != 2 {
                continue;
            }

            if let Some(found_suffix) = NumberSuffix::from_chars(word_chars) {
                *self.tokens[idx].kind.as_mut_number().unwrap().1 = Some(found_suffix);
                replace_starts.push(idx);
            }
        }
    }

    self.condense_indices(&replace_starts, 2);
}

/// Searches for contractions and condenses them down into single
/// tokens.
fn condense_contractions(&mut self) {
if self.tokens.len() < 3 {
return;
Expand Down Expand Up @@ -444,4 +483,20 @@ mod tests {
})
)
}

#[test]
fn condenses_number_suffixes() {
    // Each case pairs a source string with the expected post-condensation
    // token count.
    let cases: &[(&str, usize)] = &[
        ("1st", 1),
        ("This is the 2nd test", 9),
        ("This is the 3rd test", 9),
        (
            "It works even with weird capitalization like this: 600nD",
            18
        )
    ];

    for (source, expected) in cases {
        let document = Document::new_plain_english(source);
        assert_eq!(document.tokens.len(), *expected);
    }
}
}
2 changes: 1 addition & 1 deletion harper-core/src/lexing/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ pub fn lex_number(source: &[char]) -> Option<FoundToken> {

if let Ok(n) = s.parse::<f64>() {
return Some(FoundToken {
token: TokenKind::Number(n),
token: TokenKind::Number(n, None),
next_index: end + 1
});
}
Expand Down
4 changes: 3 additions & 1 deletion harper-core/src/linting/lint_group.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ use serde::{Deserialize, Serialize};
use super::an_a::AnA;
use super::long_sentences::LongSentences;
use super::matcher::Matcher;
use super::number_suffix_capitalization::NumberSuffixCapitalization;
use super::repeated_words::RepeatedWords;
use super::sentence_capitalization::SentenceCapitalization;
use super::spaces::Spaces;
Expand Down Expand Up @@ -97,7 +98,8 @@ create_lint_group_config!(
LongSentences => true,
RepeatedWords => true,
Spaces => true,
Matcher => true
Matcher => true,
NumberSuffixCapitalization => true
);

impl<T: Dictionary + Default> Default for LintGroup<T> {
Expand Down
1 change: 1 addition & 0 deletions harper-core/src/linting/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ mod lint;
mod lint_group;
mod long_sentences;
mod matcher;
mod number_suffix_capitalization;
mod repeated_words;
mod sentence_capitalization;
mod spaces;
Expand Down
57 changes: 57 additions & 0 deletions harper-core/src/linting/number_suffix_capitalization.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
use super::{Lint, LintKind, Linter};
use crate::token::TokenStringExt;
use crate::{Document, Span, Suggestion, TokenKind};

/// Detect and warn about ordinal number suffixes that are not fully
/// lowercase, e.g. the "ND" in "2ND" or the "nD" in "2nD".
#[derive(Debug, Clone, Copy, Default)]
pub struct NumberSuffixCapitalization;

impl Linter for NumberSuffixCapitalization {
    /// Emit one capitalization lint for every number token whose suffix
    /// contains a non-lowercase character.
    fn lint(&mut self, document: &Document) -> Vec<Lint> {
        let mut lints = Vec::new();

        for token in document.iter_numbers() {
            // Numbers without a parsed suffix have nothing to check.
            if matches!(token.kind, TokenKind::Number(_, None)) {
                continue;
            }

            // The suffix occupies the final two characters of the token.
            let suffix_span = Span::new_with_len(token.span.end, 2).pulled_by(2);
            let suffix_chars = document.get_span_content(suffix_span);

            if !suffix_chars.iter().all(|c| c.is_lowercase()) {
                let replacement: Vec<char> = suffix_chars
                    .iter()
                    .map(|c| c.to_ascii_lowercase())
                    .collect();

                lints.push(Lint {
                    span: suffix_span,
                    lint_kind: LintKind::Capitalization,
                    message: "This suffix should be lowercase".to_string(),
                    suggestions: vec![Suggestion::ReplaceWith(replacement)],
                    ..Default::default()
                });
            }
        }

        lints
    }
}

#[cfg(test)]
mod tests {
    use super::NumberSuffixCapitalization;
    use crate::linting::tests::assert_lint_count;

    // A fully uppercase suffix ("ND") must produce exactly one lint.
    #[test]
    fn detects_uppercase_suffix() {
        assert_lint_count("2ND", NumberSuffixCapitalization, 1);
    }

    // A mixed-case suffix ("nD") must also be flagged.
    #[test]
    fn detects_inconsistent_suffix() {
        assert_lint_count("2nD", NumberSuffixCapitalization, 1);
    }

    // A correctly lowercased suffix must produce no lints.
    #[test]
    fn passes_correct_case() {
        assert_lint_count("2nd", NumberSuffixCapitalization, 0);
    }
}
2 changes: 1 addition & 1 deletion harper-core/src/linting/spelled_numbers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ impl Linter for SpelledNumbers {
let mut lints = Vec::new();

for number_tok in document.iter_numbers() {
let number = number_tok.kind.number().unwrap();
let (number, _suffix) = number_tok.kind.number().unwrap();

if number - number.floor() < EPSILON && number <= 100. {
lints.push(Lint {
Expand Down
4 changes: 2 additions & 2 deletions harper-core/src/parsers/markdown.rs
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ impl Parser for Markdown {
let chunk_len = code.chars().count();

tokens.push(Token {
span: Span::new(traversed_chars, chunk_len),
span: Span::new_with_len(traversed_chars, chunk_len),
kind: TokenKind::Unlintable
});
}
Expand Down Expand Up @@ -90,7 +90,7 @@ impl Parser for Markdown {

new_tokens
.iter_mut()
.for_each(|token| token.span.offset(traversed_chars));
.for_each(|token| token.span.push_by(traversed_chars));

tokens.append(&mut new_tokens);
}
Expand Down
32 changes: 31 additions & 1 deletion harper-core/src/span.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ pub struct Span {

impl Span {
/// Construct a span over the half-open character range `[start, end)`.
///
/// # Panics
///
/// Panics if `start > end`, since such a span is malformed.
pub fn new(start: usize, end: usize) -> Self {
    // A descriptive message makes the inevitable off-by-one bug far easier
    // to track down than a bare `assertion failed` would.
    assert!(
        start <= end,
        "a span's start ({start}) must not exceed its end ({end})"
    );
    Self { start, end }
}

Expand Down Expand Up @@ -63,10 +64,39 @@ impl Span {
}

/// Add an amount to both [`Self::start`] and [`Self::end`]
pub fn push_by(&mut self, by: usize) {
    self.start += by;
    self.end += by;
}

/// Subtract an amount from both [`Self::start`] and [`Self::end`]
///
/// Panics in debug builds if `by` exceeds either bound (usize underflow).
pub fn pull_by(&mut self, by: usize) {
    self.start -= by;
    self.end -= by;
}

// Add an amount to a copy of both [`Self::start`] and [`Self::end`]
pub fn pushed_by(&self, by: usize) -> Self {
let mut clone = *self;
clone.start += by;
clone.end += by;
clone
}

// Subtract an amount to a copy of both [`Self::start`] and [`Self::end`]
pub fn pulled_by(&self, by: usize) -> Self {
let mut clone = *self;
clone.start -= by;
clone.end -= by;
clone
}

/// Add an amount to a copy of both [`Self::start`] and [`Self::end`]
///
/// Equivalent to [`Self::pushed_by`]; delegating avoids maintaining the
/// same arithmetic in two places.
pub fn with_offset(&self, by: usize) -> Self {
    self.pushed_by(by)
}
}

impl From<Range<usize>> for Span {
Expand Down
43 changes: 41 additions & 2 deletions harper-core/src/token.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ pub enum TokenKind {
#[default]
Word,
Punctuation(Punctuation),
Number(f64),
Number(f64, Option<NumberSuffix>),
/// A sequence of " " spaces.
Space(usize),
/// A sequence of "\n" newlines
Expand All @@ -51,6 +51,45 @@ pub enum TokenKind {
Unlintable
}

/// The two-letter ordinal suffix that can trail a number, e.g. "1st",
/// "2nd", "3rd", "4th". Stored alongside the numeric value in
/// [`TokenKind::Number`].
#[derive(Debug, Serialize, Deserialize, Default, PartialEq, PartialOrd, Clone, Copy, Is)]
pub enum NumberSuffix {
    /// "th", as in "4th".
    #[default]
    Th,
    /// "st", as in "1st".
    St,
    /// "nd", as in "2nd".
    Nd,
    /// "rd", as in "3rd".
    Rd
}

impl NumberSuffix {
    /// Check the first several characters in a buffer to see if it matches a
    /// number suffix.
    ///
    /// Matching is ASCII-case-insensitive; any characters past the first two
    /// are ignored.
    pub fn from_chars(chars: &[char]) -> Option<Self> {
        // Normalize the leading pair to lowercase so each suffix needs a
        // single match arm rather than four case permutations.
        let (first, second) = match chars {
            [a, b, ..] => (a.to_ascii_lowercase(), b.to_ascii_lowercase()),
            _ => return None
        };

        match (first, second) {
            ('t', 'h') => Some(NumberSuffix::Th),
            ('s', 't') => Some(NumberSuffix::St),
            ('n', 'd') => Some(NumberSuffix::Nd),
            ('r', 'd') => Some(NumberSuffix::Rd),
            _ => None
        }
    }
}

impl TokenKind {
pub fn as_mut_quote(&mut self) -> Option<&mut Quote> {
self.as_mut_punctuation()?.as_mut_quote()
Expand Down Expand Up @@ -170,7 +209,7 @@ impl TokenStringExt for [Token] {

fn iter_number_indices(&self) -> impl Iterator<Item = usize> + '_ {
self.iter().enumerate().filter_map(|(idx, token)| {
if let TokenKind::Number(_) = &token.kind {
if let TokenKind::Number(..) = &token.kind {
Some(idx)
} else {
None
Expand Down
2 changes: 1 addition & 1 deletion harper-ls/src/comment_parsers/go.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ impl Parser for Go {

new_tokens
.iter_mut()
.for_each(|t| t.span.offset(actual.start));
.for_each(|t| t.span.push_by(actual.start));

new_tokens
}
Expand Down
2 changes: 1 addition & 1 deletion harper-ls/src/comment_parsers/unit.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ impl Parser for Unit {

new_tokens
.iter_mut()
.for_each(|t| t.span.offset(actual.start));
.for_each(|t| t.span.push_by(actual.start));

new_tokens
}
Expand Down
2 changes: 1 addition & 1 deletion harper-ls/src/tree_sitter_parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ impl Parser for TreeSitterParser {

new_tokens
.iter_mut()
.for_each(|v| v.span.offset(span.start));
.for_each(|v| v.span.push_by(span.start));

// The comment parser will insert a newline at end-of-input.
// If the next tree-sitter chunk is a comment, we want to remove that.
Expand Down

0 comments on commit 7d65264

Please sign in to comment.