Skip to content

Commit

Permalink
feat: now properly parses number suffixes + lints their capitalization
Browse files Browse the repository at this point in the history
  • Loading branch information
Elijah Potter authored and Elijah Potter committed May 14, 2024
1 parent 5f32ae6 commit 7d65264
Show file tree
Hide file tree
Showing 12 changed files with 197 additions and 13 deletions.
59 changes: 57 additions & 2 deletions harper-core/src/document.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ use crate::linting::Suggestion;
use crate::parsers::{Markdown, Parser, PlainEnglish};
use crate::punctuation::Punctuation;
use crate::span::Span;
use crate::token::NumberSuffix;
use crate::{FatToken, Token, TokenKind, TokenStringExt};

pub struct Document {
Expand Down Expand Up @@ -55,12 +56,24 @@ impl Document {
/// Lex `self.source` into `self.tokens`, then run the token
/// post-processing passes over the fresh token stream.
fn parse(&mut self) {
    self.tokens = self.parser.parse(&self.source);
    // NOTE(review): pass order looks significant — contractions are condensed
    // before number suffixes, and quote matching runs on the fully condensed
    // stream. Confirm before reordering.
    self.condense_contractions();
    self.condense_number_suffixes();
    self.match_quotes();
}

/// Given a list of indices, this function removes the subsequent
/// `stretch_len - 1` elements after each index.
///
/// Will extend token spans to include removed elements.
/// Assumes condensed tokens are contiguous in source text.
fn condense_indices(&mut self, indices: &[usize], stretch_len: usize) {
// Update spans
for idx in indices {
let end_tok = self.tokens[idx + stretch_len - 1];
let start_tok = &mut self.tokens[*idx];

start_tok.span.end = end_tok.span.end;
}

// Trim
let old = self.tokens.clone();
self.tokens.clear();
Expand All @@ -86,7 +99,7 @@ impl Document {
.last()
.map(|v| v + stretch_len)
.unwrap_or(indices.len())..]
)
);
}

pub fn get_token_at_char_index(&self, char_index: usize) -> Option<Token> {
Expand Down Expand Up @@ -243,8 +256,34 @@ impl Document {
}
}

/// Searches for number suffixes (e.g. the "st" in "1st") and condenses them
/// down into single number tokens carrying the parsed [`NumberSuffix`].
fn condense_number_suffixes(&mut self) {
    if self.tokens.len() < 2 {
        return;
    }

    let mut replace_starts = Vec::new();

    for idx in 0..self.tokens.len() - 1 {
        let b = self.tokens[idx + 1];
        let a = self.tokens[idx];

        // TODO: Allow spaces between `a` and `b`

        if let (TokenKind::Number(..), TokenKind::Word) = (a.kind, b.kind) {
            let word_chars = self.get_span_content(b.span);

            // Only a word that is *exactly* a two-character suffix may be
            // merged. `NumberSuffix::from_chars` matches on the first two
            // characters alone, so without this guard a longer word like
            // "sty" in "1sty" would be wrongly absorbed into the number.
            if word_chars.len() != 2 {
                continue;
            }

            if let Some(found_suffix) = NumberSuffix::from_chars(word_chars) {
                *self.tokens[idx].kind.as_mut_number().unwrap().1 = Some(found_suffix);
                replace_starts.push(idx);
            }
        }
    }

    self.condense_indices(&replace_starts, 2);
}

/// Searches for contractions and condenses them down into single
/// tokens.
fn condense_contractions(&mut self) {
if self.tokens.len() < 3 {
return;
Expand Down Expand Up @@ -444,4 +483,20 @@ mod tests {
})
)
}

#[test]
fn condenses_number_suffixes() {
    // Each case pairs a source string with the expected post-condensation
    // token count.
    let cases: &[(&str, usize)] = &[
        ("1st", 1),
        ("This is the 2nd test", 9),
        ("This is the 3rd test", 9),
        (
            "It works even with weird capitalization like this: 600nD",
            18
        )
    ];

    for (source, expected) in cases {
        let document = Document::new_plain_english(source);
        assert_eq!(document.tokens.len(), *expected);
    }
}
}
2 changes: 1 addition & 1 deletion harper-core/src/lexing/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ pub fn lex_number(source: &[char]) -> Option<FoundToken> {

if let Ok(n) = s.parse::<f64>() {
return Some(FoundToken {
token: TokenKind::Number(n),
token: TokenKind::Number(n, None),
next_index: end + 1
});
}
Expand Down
4 changes: 3 additions & 1 deletion harper-core/src/linting/lint_group.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ use serde::{Deserialize, Serialize};
use super::an_a::AnA;
use super::long_sentences::LongSentences;
use super::matcher::Matcher;
use super::number_suffix_capitalization::NumberSuffixCapitalization;
use super::repeated_words::RepeatedWords;
use super::sentence_capitalization::SentenceCapitalization;
use super::spaces::Spaces;
Expand Down Expand Up @@ -97,7 +98,8 @@ create_lint_group_config!(
LongSentences => true,
RepeatedWords => true,
Spaces => true,
Matcher => true
Matcher => true,
NumberSuffixCapitalization => true
);

impl<T: Dictionary + Default> Default for LintGroup<T> {
Expand Down
1 change: 1 addition & 0 deletions harper-core/src/linting/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ mod lint;
mod lint_group;
mod long_sentences;
mod matcher;
mod number_suffix_capitalization;
mod repeated_words;
mod sentence_capitalization;
mod spaces;
Expand Down
57 changes: 57 additions & 0 deletions harper-core/src/linting/number_suffix_capitalization.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
use super::{Lint, LintKind, Linter};
use crate::token::TokenStringExt;
use crate::{Document, Span, Suggestion, TokenKind};

/// Detect and warn about ordinal number suffixes that are not fully
/// lowercase, e.g. the "ND" in "2ND" or the "nD" in "2nD".
#[derive(Debug, Clone, Copy, Default)]
pub struct NumberSuffixCapitalization;

impl Linter for NumberSuffixCapitalization {
    /// Emit one capitalization lint for every number token whose suffix
    /// contains a non-lowercase character.
    fn lint(&mut self, document: &Document) -> Vec<Lint> {
        let mut lints = Vec::new();

        for token in document.iter_numbers() {
            // Numbers without a parsed suffix have nothing to check.
            if matches!(token.kind, TokenKind::Number(_, None)) {
                continue;
            }

            // The suffix occupies the final two characters of the token.
            let suffix_span = Span::new_with_len(token.span.end, 2).pulled_by(2);
            let suffix_chars = document.get_span_content(suffix_span);

            if !suffix_chars.iter().all(|c| c.is_lowercase()) {
                let replacement: Vec<char> = suffix_chars
                    .iter()
                    .map(|c| c.to_ascii_lowercase())
                    .collect();

                lints.push(Lint {
                    span: suffix_span,
                    lint_kind: LintKind::Capitalization,
                    message: "This suffix should be lowercase".to_string(),
                    suggestions: vec![Suggestion::ReplaceWith(replacement)],
                    ..Default::default()
                });
            }
        }

        lints
    }
}

#[cfg(test)]
mod tests {
    use super::NumberSuffixCapitalization;
    use crate::linting::tests::assert_lint_count;

    // A fully uppercase suffix ("ND") must produce exactly one lint.
    #[test]
    fn detects_uppercase_suffix() {
        assert_lint_count("2ND", NumberSuffixCapitalization, 1);
    }

    // A mixed-case suffix ("nD") must also be flagged.
    #[test]
    fn detects_inconsistent_suffix() {
        assert_lint_count("2nD", NumberSuffixCapitalization, 1);
    }

    // A correctly lowercased suffix must produce no lints.
    #[test]
    fn passes_correct_case() {
        assert_lint_count("2nd", NumberSuffixCapitalization, 0);
    }
}
2 changes: 1 addition & 1 deletion harper-core/src/linting/spelled_numbers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ impl Linter for SpelledNumbers {
let mut lints = Vec::new();

for number_tok in document.iter_numbers() {
let number = number_tok.kind.number().unwrap();
let (number, _suffix) = number_tok.kind.number().unwrap();

if number - number.floor() < EPSILON && number <= 100. {
lints.push(Lint {
Expand Down
4 changes: 2 additions & 2 deletions harper-core/src/parsers/markdown.rs
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ impl Parser for Markdown {
let chunk_len = code.chars().count();

tokens.push(Token {
span: Span::new(traversed_chars, chunk_len),
span: Span::new_with_len(traversed_chars, chunk_len),
kind: TokenKind::Unlintable
});
}
Expand Down Expand Up @@ -90,7 +90,7 @@ impl Parser for Markdown {

new_tokens
.iter_mut()
.for_each(|token| token.span.offset(traversed_chars));
.for_each(|token| token.span.push_by(traversed_chars));

tokens.append(&mut new_tokens);
}
Expand Down
32 changes: 31 additions & 1 deletion harper-core/src/span.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ pub struct Span {

impl Span {
/// Construct a span over the half-open character range `[start, end)`.
///
/// # Panics
///
/// Panics if `start > end`, since such a span is malformed.
pub fn new(start: usize, end: usize) -> Self {
    // A descriptive message makes the inevitable off-by-one bug far easier
    // to track down than a bare `assertion failed` would.
    assert!(
        start <= end,
        "a span's start ({start}) must not exceed its end ({end})"
    );
    Self { start, end }
}

Expand Down Expand Up @@ -63,10 +64,39 @@ impl Span {
}

/// Add an amount to both [`Self::start`] and [`Self::end`]
pub fn push_by(&mut self, by: usize) {
    self.start += by;
    self.end += by;
}

/// Subtract an amount from both [`Self::start`] and [`Self::end`]
///
/// Panics in debug builds if `by` exceeds either bound (usize underflow).
pub fn pull_by(&mut self, by: usize) {
    self.start -= by;
    self.end -= by;
}

// Add an amount to a copy of both [`Self::start`] and [`Self::end`]
pub fn pushed_by(&self, by: usize) -> Self {
let mut clone = *self;
clone.start += by;
clone.end += by;
clone
}

// Subtract an amount to a copy of both [`Self::start`] and [`Self::end`]
pub fn pulled_by(&self, by: usize) -> Self {
let mut clone = *self;
clone.start -= by;
clone.end -= by;
clone
}

/// Add an amount to a copy of both [`Self::start`] and [`Self::end`]
///
/// Equivalent to [`Self::pushed_by`]; delegating avoids maintaining the
/// same arithmetic in two places.
pub fn with_offset(&self, by: usize) -> Self {
    self.pushed_by(by)
}
}

impl From<Range<usize>> for Span {
Expand Down
43 changes: 41 additions & 2 deletions harper-core/src/token.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ pub enum TokenKind {
#[default]
Word,
Punctuation(Punctuation),
Number(f64),
Number(f64, Option<NumberSuffix>),
/// A sequence of " " spaces.
Space(usize),
/// A sequence of "\n" newlines
Expand All @@ -51,6 +51,45 @@ pub enum TokenKind {
Unlintable
}

/// The two-letter ordinal suffix that can trail a number, e.g. "1st",
/// "2nd", "3rd", "4th". Stored alongside the numeric value in
/// [`TokenKind::Number`].
#[derive(Debug, Serialize, Deserialize, Default, PartialEq, PartialOrd, Clone, Copy, Is)]
pub enum NumberSuffix {
    /// "th", as in "4th".
    #[default]
    Th,
    /// "st", as in "1st".
    St,
    /// "nd", as in "2nd".
    Nd,
    /// "rd", as in "3rd".
    Rd
}

impl NumberSuffix {
    /// Check the first several characters in a buffer to see if it matches a
    /// number suffix.
    ///
    /// Matching is ASCII-case-insensitive; any characters past the first two
    /// are ignored.
    pub fn from_chars(chars: &[char]) -> Option<Self> {
        // Normalize the leading pair to lowercase so each suffix needs a
        // single match arm rather than four case permutations.
        let (first, second) = match chars {
            [a, b, ..] => (a.to_ascii_lowercase(), b.to_ascii_lowercase()),
            _ => return None
        };

        match (first, second) {
            ('t', 'h') => Some(NumberSuffix::Th),
            ('s', 't') => Some(NumberSuffix::St),
            ('n', 'd') => Some(NumberSuffix::Nd),
            ('r', 'd') => Some(NumberSuffix::Rd),
            _ => None
        }
    }
}

impl TokenKind {
pub fn as_mut_quote(&mut self) -> Option<&mut Quote> {
self.as_mut_punctuation()?.as_mut_quote()
Expand Down Expand Up @@ -170,7 +209,7 @@ impl TokenStringExt for [Token] {

fn iter_number_indices(&self) -> impl Iterator<Item = usize> + '_ {
self.iter().enumerate().filter_map(|(idx, token)| {
if let TokenKind::Number(_) = &token.kind {
if let TokenKind::Number(..) = &token.kind {
Some(idx)
} else {
None
Expand Down
2 changes: 1 addition & 1 deletion harper-ls/src/comment_parsers/go.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ impl Parser for Go {

new_tokens
.iter_mut()
.for_each(|t| t.span.offset(actual.start));
.for_each(|t| t.span.push_by(actual.start));

new_tokens
}
Expand Down
2 changes: 1 addition & 1 deletion harper-ls/src/comment_parsers/unit.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ impl Parser for Unit {

new_tokens
.iter_mut()
.for_each(|t| t.span.offset(actual.start));
.for_each(|t| t.span.push_by(actual.start));

new_tokens
}
Expand Down
2 changes: 1 addition & 1 deletion harper-ls/src/tree_sitter_parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ impl Parser for TreeSitterParser {

new_tokens
.iter_mut()
.for_each(|v| v.span.offset(span.start));
.for_each(|v| v.span.push_by(span.start));

// The comment parser will insert a newline at end-of-input.
// If the next tree-sitter chunk is a comment, we want to remove that.
Expand Down

0 comments on commit 7d65264

Please sign in to comment.