diff --git a/harper-core/src/linting/an_a.rs b/harper-core/src/linting/an_a.rs new file mode 100644 index 00000000..658f84bb --- /dev/null +++ b/harper-core/src/linting/an_a.rs @@ -0,0 +1,137 @@ +use itertools::Itertools; + +use crate::{Document, Lint, LintKind, Linter, Suggestion, TokenStringExt}; + +#[derive(Debug, Default)] +pub struct AnA; + +impl Linter for AnA { + fn lint(&mut self, document: &Document) -> Vec { + let mut lints = Vec::new(); + + for (first, second) in document.iter_words().tuple_windows() { + let chars_first = document.get_span_content(first.span); + let chars_second = document.get_span_content(second.span); + + let is_a_an = match chars_first { + ['a'] => Some(true), + ['a', 'n'] => Some(false), + _ => None, + }; + + let Some(a_an) = is_a_an else { + continue; + }; + + let should_be_a_an = !starts_with_vowel(chars_second); + + if a_an != should_be_a_an { + let replacement = match a_an { + true => vec!['a', 'n'], + false => vec!['a'], + }; + + lints.push(Lint { + span: first.span, + lint_kind: LintKind::Formatting, + suggestions: vec![Suggestion::ReplaceWith(replacement)], + message: "This is not vocally correct.".to_string(), + priority: 31, + }) + } + } + + lints + } +} + +// Checks whether a provided word begins with a vowel _sound_. +// +// It was produced through trial and error. +// Matches with 99.71% and 99.77% of vowels and non-vowels in the +// Carnegie-Mellon University word -> pronunciation dataset. +fn starts_with_vowel(word: &[char]) -> bool { + if word.is_empty() { + return false; + } + + if matches!( + word, + ['e', 'u', 'p', 'h', ..] | ['e', 'u', 'g' | 'l' | 'c', ..] + ) { + return false; + } + + if matches!(word, ['u', 'k', ..]) { + return false; + } + + if matches!( + word, + ['h', 'o', 'u', 'r', ..] + | ['h', 'o', 'n', ..] + | ['u', 'n', 'i', 'n' | 'm', ..] + | ['u', 'n', 'a' | 'u', ..] + | ['h', 'e', 'r', 'b', ..] + | ['u', 'r', 'b', ..] 
+ ) { + return true; + } + + if matches!(word, ['u', 'n' | 's', 'i' | 'a' | 'u', ..]) { + return false; + } + + if matches!(word, ['u', 'n', ..]) { + return true; + } + + if matches!(word, ['u', 'r', 'g', ..]) { + return true; + } + + if matches!( + word, + ['u', 't' | 'r' | 'n', ..] | ['e', 'u', 'r', ..] | ['u', 'w', ..] | ['u', 's', 'e', ..] + ) { + return false; + } + + if matches!(word, ['o', 'n', 'e', 'a' | 'e' | 'i' | 'u', 'l' | 'd', ..]) { + return true; + } + + if matches!(word, ['o', 'n', 'e', 'a' | 'e' | 'i' | 'u' | '-' | 's', ..]) { + return false; + } + + if matches!( + word, + ['s', 'o', 's'] + | ['r', 'z', ..] + | ['n', 'g', ..] + | ['n', 'v', ..] + | ['x'] + | ['x', 'b', 'o', 'x'] + | ['h', 'e', 'i', 'r', ..] + | ['h', 'o', 'n', 'o', 'r', ..] + ) { + return true; + } + + if matches!( + word, + ['j', 'u' | 'o', 'n', ..] | ['j', 'u', 'r' | 'n', 'a' | 'i' | 'o', ..] + ) { + return false; + } + + if matches!(word, ['x', '-' | '\'' | '.' | 'o' | 's', ..]) { + return true; + } + + matches!( + word, + ['a', ..] | ['e', ..] | ['i', ..] | ['o', ..] | ['u', ..] 
+ ) +} diff --git a/harper-core/src/linting/lint_set.rs b/harper-core/src/linting/lint_set.rs index 8f7e3e0c..0a6a7228 100644 --- a/harper-core/src/linting/lint_set.rs +++ b/harper-core/src/linting/lint_set.rs @@ -1,6 +1,5 @@ use paste::paste; -use super::long_sentences::LongSentences; use super::matcher::Matcher; use super::repeated_words::RepeatedWords; use super::sentence_capitalization::SentenceCapitalization; @@ -9,10 +8,11 @@ use super::spell_check::SpellCheck; use super::unclosed_quotes::UnclosedQuotes; use super::wrong_quotes::WrongQuotes; use super::Linter; +use super::{an_a::AnA, long_sentences::LongSentences}; use crate::{Dictionary, Document, Lint}; pub struct LintSet { - pub(super) linters: Vec> + pub(super) linters: Vec>, } impl Linter for LintSet { @@ -32,12 +32,13 @@ impl Linter for LintSet { impl LintSet { pub fn new() -> Self { Self { - linters: Vec::new() + linters: Vec::new(), } } pub fn add_standard(&mut self, dictionary: impl Dictionary + 'static) -> &mut Self { self.add_repeated_words() + .add_an_a() .add_long_sentences() .add_unclosed_quotes() .add_sentence_capitalization() @@ -97,6 +98,7 @@ macro_rules! 
create_simple_builder_methods { } create_simple_builder_methods!( + AnA, SentenceCapitalization, UnclosedQuotes, WrongQuotes, diff --git a/harper-core/src/linting/matcher.rs b/harper-core/src/linting/matcher.rs index 41d0187f..c4403131 100644 --- a/harper-core/src/linting/matcher.rs +++ b/harper-core/src/linting/matcher.rs @@ -4,7 +4,7 @@ use crate::{Document, Lint, LintKind, Linter, Punctuation, Span, Suggestion, Tok #[derive(Debug, PartialEq, PartialOrd, Clone)] struct PatternToken { kind: TokenKind, - content: Option + content: Option, } impl PatternToken { @@ -12,12 +12,12 @@ impl PatternToken { if token.kind.is_word() { Self { kind: token.kind, - content: Some(document.get_span_content(token.span).into()) + content: Some(document.get_span_content(token.span).into()), } } else { Self { kind: token.kind, - content: None + content: None, } } } @@ -85,13 +85,13 @@ macro_rules! pt { struct Rule { pattern: Vec, - replace_with: Vec + replace_with: Vec, } /// A linter that uses a variety of curated pattern matches to find and fix /// common grammatical issues. 
pub struct Matcher { - triggers: Vec + triggers: Vec, } impl Matcher { @@ -193,24 +193,24 @@ impl Matcher { // We need to be more explicit that we are replacing with an Em dash triggers.push(Rule { pattern: vec![pt!(Hyphen), pt!(Hyphen), pt!(Hyphen)], - replace_with: vecword!("—") + replace_with: vecword!("—"), }); // Same goes for this En dash triggers.push(Rule { pattern: vec![pt!(Hyphen), pt!(Hyphen)], - replace_with: vecword!("–") + replace_with: vecword!("–"), }); // And this ellipsis triggers.push(Rule { pattern: vec![pt!(Period), pt!(Period), pt!(Period)], - replace_with: vecword!("…") + replace_with: vecword!("…"), }); triggers.push(Rule { pattern: vec![pt!("L"), pt!(Period), pt!("L"), pt!(Period), pt!("M")], - replace_with: vecword!("large language model") + replace_with: vecword!("large language model"), }); triggers.push(Rule { @@ -222,7 +222,7 @@ impl Matcher { pt!("M"), pt!(Period), ], - replace_with: vecword!("large language model") + replace_with: vecword!("large language model"), }); Self { triggers } @@ -262,7 +262,7 @@ impl Linter for Matcher { if match_tokens.len() == trigger.pattern.len() && !match_tokens.is_empty() { let span = Span::new( match_tokens.first().unwrap().span.start, - match_tokens.last().unwrap().span.end + match_tokens.last().unwrap().span.end, ); lints.push(Lint { @@ -273,7 +273,7 @@ impl Linter for Matcher { "Did you mean “{}”?", trigger.replace_with.iter().collect::() ), - priority: 15 + priority: 15, }) } } diff --git a/harper-core/src/linting/mod.rs b/harper-core/src/linting/mod.rs index 3ac7d956..c412d22a 100644 --- a/harper-core/src/linting/mod.rs +++ b/harper-core/src/linting/mod.rs @@ -1,3 +1,4 @@ +mod an_a; mod lint; mod lint_set; mod long_sentences; diff --git a/harper-core/src/linting/spell_check.rs b/harper-core/src/linting/spell_check.rs index e0578587..47ed40d9 100644 --- a/harper-core/src/linting/spell_check.rs +++ b/harper-core/src/linting/spell_check.rs @@ -8,17 +8,17 @@ use crate::Dictionary; pub struct 
SpellCheck where - T: Dictionary + T: Dictionary, { dictionary: T, - word_cache: HashMap, Vec>> + word_cache: HashMap, Vec>>, } impl SpellCheck { pub fn new(dictionary: T) -> Self { Self { dictionary, - word_cache: HashMap::new() + word_cache: HashMap::new(), } } } @@ -86,7 +86,7 @@ impl Linter for SpellCheck { "Did you mean to spell “{}” this way?", document.get_span_content_str(word.span) ), - priority: 63 + priority: 63, }) }