feat: now lints for proper use of "a" vs "an"

Automattic · Feb 29, 2024 · 7a90f1a · 7a90f1a
1 parent 1da200f
commit 7a90f1a
Show file tree

Hide file tree

Showing 5 changed files with 159 additions and 19 deletions.
diff --git a/harper-core/src/linting/an_a.rs b/harper-core/src/linting/an_a.rs
@@ -0,0 +1,137 @@
+use itertools::Itertools;
+
+use crate::{Document, Lint, LintKind, Linter, Suggestion, TokenStringExt};
+
+/// Linter that checks the indefinite articles "a" and "an" against the word
+/// that follows them.
+#[derive(Debug, Default)]
+pub struct AnA;
+
+impl Linter for AnA {
+    /// Flags every "a"/"an" whose form disagrees with the sound that begins
+    /// the next word, suggesting the other article as the replacement.
+    fn lint(&mut self, document: &Document) -> Vec<crate::Lint> {
+        let mut found = Vec::new();
+
+        // Walk over each pair of consecutive words yielded by the document.
+        for (article, following) in document.iter_words().tuple_windows() {
+            let article_chars = document.get_span_content(article.span);
+            let following_chars = document.get_span_content(following.span);
+
+            // `true` when the author wrote "a", `false` for "an". Any other
+            // word is not an indefinite article, so the pair is skipped.
+            let wrote_a = match article_chars {
+                ['a'] => true,
+                ['a', 'n'] => false,
+                _ => continue,
+            };
+
+            // "a" is wanted exactly when the next word lacks a leading vowel sound.
+            let wants_a = !starts_with_vowel(following_chars);
+            if wrote_a == wants_a {
+                continue;
+            }
+
+            // Offer whichever article the author did not use.
+            let replacement = if wrote_a {
+                vec!['a', 'n']
+            } else {
+                vec!['a']
+            };
+
+            found.push(Lint {
+                span: article.span,
+                lint_kind: LintKind::Formatting,
+                suggestions: vec![Suggestion::ReplaceWith(replacement)],
+                message: "This is not vocally correct.".to_string(),
+                priority: 31,
+            });
+        }
+
+        found
+    }
+}
+
+/// Checks whether a provided word begins with a vowel _sound_.
+///
+/// The rules work purely on spelling and were produced through trial and error.
+/// The original rule set matched 99.71% and 99.77% of vowels and non-vowels in the
+/// Carnegie-Mellon University word -> pronunciation dataset; those figures have not
+/// been re-measured since the `one`/`once`, `hon…` and capitalization adjustments.
+///
+/// Rules are tested from top to bottom and the first match wins, so the specific
+/// exceptions must stay above the broader patterns they carve holes in.
+///
+/// Returns `false` for an empty word.
+fn starts_with_vowel(word: &[char]) -> bool {
+    // A capitalized ordinary word ("American", "Hour") sounds like its lowercase
+    // form, so fold its leading capital before matching. Tokens with a capital
+    // anywhere else ("US", "USB") and single letters are left untouched: they are
+    // often spelled out letter by letter, which these spelling rules cannot model.
+    let folded: Vec<char>;
+    let word = match word.split_first() {
+        Some((first, rest))
+            if first.is_ascii_uppercase()
+                && !rest.is_empty()
+                && !rest.iter().any(|c| c.is_uppercase()) =>
+        {
+            folded = std::iter::once(first.to_ascii_lowercase())
+                .chain(rest.iter().copied())
+                .collect();
+            folded.as_slice()
+        }
+        _ => word,
+    };
+
+    if word.is_empty() {
+        return false;
+    }
+
+    // Vowel letters that open with a consonant sound ("euphoria", "ukulele"),
+    // including the bare words "one" and "once" (pronounced with a leading "w").
+    if matches!(
+        word,
+        ['e', 'u', 'p', 'h', ..]
+            | ['e', 'u', 'g' | 'l' | 'c', ..]
+            | ['u', 'k', ..]
+            | ['o', 'n', 'e']
+            | ['o', 'n', 'c', 'e']
+    ) {
+        return false;
+    }
+
+    // Silent "h" ("hour", "honest", "honor", "honour", "herb") and "u" read as a
+    // plain vowel ("uninstall", "unable", "urban"). The "hon…" patterns are kept
+    // narrow so that "honey" or "honk" are not treated as vowel-initial.
+    if matches!(
+        word,
+        ['h', 'o', 'u', 'r', ..]
+            | ['h', 'o', 'n', 'e', 's', 't', ..]
+            | ['h', 'o', 'n', 'o', 'r', ..]
+            | ['h', 'o', 'n', 'o', 'u', 'r', ..]
+            | ['u', 'n', 'i', 'n' | 'm', ..]
+            | ['u', 'n', 'a' | 'u', ..]
+            | ['h', 'e', 'r', 'b', ..]
+            | ['u', 'r', 'b', ..]
+    ) {
+        return true;
+    }
+
+    // Remaining "uni…" plus "usi…", "usa…", "usu…" open with a "you" sound
+    // ("unit", "using", "usage", "usual").
+    if matches!(word, ['u', 'n' | 's', 'i' | 'a' | 'u', ..]) {
+        return false;
+    }
+
+    // Every other "un…" ("under", "unhappy") and "urg…" ("urgent") is a real vowel.
+    if matches!(word, ['u', 'n', ..] | ['u', 'r', 'g', ..]) {
+        return true;
+    }
+
+    // "ut…", the remaining "ur…", "eur…", "uw…" and "use…" are treated as
+    // consonant-initial ("utility", "uranium", "european", "user").
+    if matches!(
+        word,
+        ['u', 't' | 'r', ..] | ['e', 'u', 'r', ..] | ['u', 'w', ..] | ['u', 's', 'e', ..]
+    ) {
+        return false;
+    }
+
+    // "one" + vowel + "l"/"d" (e.g. "oneida") keeps its vowel sound...
+    if matches!(word, ['o', 'n', 'e', 'a' | 'e' | 'i' | 'u', 'l' | 'd', ..]) {
+        return true;
+    }
+
+    // ...while other "one…" compounds ("ones", "oneself") start with a "w" sound.
+    if matches!(word, ['o', 'n', 'e', 'a' | 'e' | 'i' | 'u' | '-' | 's', ..]) {
+        return false;
+    }
+
+    // Consonant letters that are read with a leading vowel sound.
+    if matches!(
+        word,
+        ['s', 'o', 's']
+            | ['r', 'z', ..]
+            | ['n', 'g', ..]
+            | ['n', 'v', ..]
+            | ['x']
+            | ['x', 'b', 'o', 'x']
+            | ['h', 'e', 'i', 'r', ..]
+    ) {
+        return true;
+    }
+
+    // Explicit consonant cases carried over from the original rule set; the
+    // fallback below would classify these "j…" words the same way.
+    if matches!(
+        word,
+        ['j', 'u' | 'o', 'n', ..] | ['j', 'u', 'r' | 'n', 'a' | 'i' | 'o', ..]
+    ) {
+        return false;
+    }
+
+    // "x" followed by punctuation, "o" or "s" is treated as vowel-initial.
+    if matches!(word, ['x', '-' | '\'' | '.' | 'o' | 's', ..]) {
+        return true;
+    }
+
+    // Fallback: decide by the first letter alone.
+    matches!(
+        word,
+        ['a', ..] | ['e', ..] | ['i', ..] | ['o', ..] | ['u', ..]
+    )
+}
diff --git a/harper-core/src/linting/lint_set.rs b/harper-core/src/linting/lint_set.rs
@@ -1,6 +1,5 @@
 use paste::paste;
 
-use super::long_sentences::LongSentences;
 use super::matcher::Matcher;
 use super::repeated_words::RepeatedWords;
 use super::sentence_capitalization::SentenceCapitalization;
@@ -9,10 +8,11 @@ use super::spell_check::SpellCheck;
 use super::unclosed_quotes::UnclosedQuotes;
 use super::wrong_quotes::WrongQuotes;
 use super::Linter;
+use super::{an_a::AnA, long_sentences::LongSentences};
 use crate::{Dictionary, Document, Lint};
 
 pub struct LintSet {
-    pub(super) linters: Vec<Box<dyn Linter>>
+    pub(super) linters: Vec<Box<dyn Linter>>,
 }
 
 impl Linter for LintSet {
@@ -32,12 +32,13 @@ impl Linter for LintSet {
 impl LintSet {
     pub fn new() -> Self {
         Self {
-            linters: Vec::new()
+            linters: Vec::new(),
         }
     }
 
     pub fn add_standard(&mut self, dictionary: impl Dictionary + 'static) -> &mut Self {
         self.add_repeated_words()
+            .add_an_a()
             .add_long_sentences()
             .add_unclosed_quotes()
             .add_sentence_capitalization()
@@ -97,6 +98,7 @@ macro_rules! create_simple_builder_methods {
 }
 
 create_simple_builder_methods!(
+    AnA,
     SentenceCapitalization,
     UnclosedQuotes,
     WrongQuotes,

diff --git a/harper-core/src/linting/matcher.rs b/harper-core/src/linting/matcher.rs
@@ -4,20 +4,20 @@ use crate::{Document, Lint, LintKind, Linter, Punctuation, Span, Suggestion, Tok
 #[derive(Debug, PartialEq, PartialOrd, Clone)]
 struct PatternToken {
     kind: TokenKind,
-    content: Option<DictWord>
+    content: Option<DictWord>,
 }
 
 impl PatternToken {
     fn from_token(token: Token, document: &Document) -> Self {
         if token.kind.is_word() {
             Self {
                 kind: token.kind,
-                content: Some(document.get_span_content(token.span).into())
+                content: Some(document.get_span_content(token.span).into()),
             }
         } else {
             Self {
                 kind: token.kind,
-                content: None
+                content: None,
             }
         }
     }
@@ -85,13 +85,13 @@ macro_rules! pt {
 
 struct Rule {
     pattern: Vec<PatternToken>,
-    replace_with: Vec<char>
+    replace_with: Vec<char>,
 }
 
 /// A linter that uses a variety of curated pattern matches to find and fix
 /// common grammatical issues.
 pub struct Matcher {
-    triggers: Vec<Rule>
+    triggers: Vec<Rule>,
 }
 
 impl Matcher {
@@ -193,24 +193,24 @@ impl Matcher {
         // We need to be more explicit that we are replacing with an Em dash
         triggers.push(Rule {
             pattern: vec![pt!(Hyphen), pt!(Hyphen), pt!(Hyphen)],
-            replace_with: vecword!("—")
+            replace_with: vecword!("—"),
         });
 
         // Same goes for this En dash
         triggers.push(Rule {
             pattern: vec![pt!(Hyphen), pt!(Hyphen)],
-            replace_with: vecword!("–")
+            replace_with: vecword!("–"),
         });
 
         // And this ellipsis
         triggers.push(Rule {
             pattern: vec![pt!(Period), pt!(Period), pt!(Period)],
-            replace_with: vecword!("…")
+            replace_with: vecword!("…"),
         });
 
         triggers.push(Rule {
             pattern: vec![pt!("L"), pt!(Period), pt!("L"), pt!(Period), pt!("M")],
-            replace_with: vecword!("large language model")
+            replace_with: vecword!("large language model"),
         });
 
         triggers.push(Rule {
@@ -222,7 +222,7 @@ impl Matcher {
                 pt!("M"),
                 pt!(Period),
             ],
-            replace_with: vecword!("large language model")
+            replace_with: vecword!("large language model"),
         });
 
         Self { triggers }
@@ -262,7 +262,7 @@ impl Linter for Matcher {
                 if match_tokens.len() == trigger.pattern.len() && !match_tokens.is_empty() {
                     let span = Span::new(
                         match_tokens.first().unwrap().span.start,
-                        match_tokens.last().unwrap().span.end
+                        match_tokens.last().unwrap().span.end,
                     );
 
                     lints.push(Lint {
@@ -273,7 +273,7 @@ impl Linter for Matcher {
                             "Did you mean “{}”?",
                             trigger.replace_with.iter().collect::<String>()
                         ),
-                        priority: 15
+                        priority: 15,
                     })
                 }
             }

diff --git a/harper-core/src/linting/mod.rs b/harper-core/src/linting/mod.rs
@@ -1,3 +1,4 @@
+mod an_a;
 mod lint;
 mod lint_set;
 mod long_sentences;

diff --git a/harper-core/src/linting/spell_check.rs b/harper-core/src/linting/spell_check.rs
@@ -8,17 +8,17 @@ use crate::Dictionary;
 
 pub struct SpellCheck<T>
 where
-    T: Dictionary
+    T: Dictionary,
 {
     dictionary: T,
-    word_cache: HashMap<Vec<char>, Vec<Vec<char>>>
+    word_cache: HashMap<Vec<char>, Vec<Vec<char>>>,
 }
 
 impl<T: Dictionary> SpellCheck<T> {
     pub fn new(dictionary: T) -> Self {
         Self {
             dictionary,
-            word_cache: HashMap::new()
+            word_cache: HashMap::new(),
         }
     }
 }
@@ -86,7 +86,7 @@ impl<T: Dictionary> Linter for SpellCheck<T> {
                     "Did you mean to spell “{}” this way?",
                     document.get_span_content_str(word.span)
                 ),
-                priority: 63
+                priority: 63,
             })
         }