Commit c106baf
Now condenses contractions
Fixed contraction condensation
elijah-potter committed Jan 23, 2024
1 parent 72b22e3 commit c106baf
Showing 5 changed files with 172 additions and 57 deletions.
110 changes: 109 additions & 1 deletion harper-core/src/document.rs
@@ -44,6 +44,8 @@ impl Document {
self.tokens = lex_to_end(&self.source);
}

self.condense_contractions();
// Quote matching must come after condensing, since it depends on token indices.
self.match_quotes();
}

@@ -84,6 +86,73 @@ impl Document {
}
}

/// Searches for contractions and condenses them into single tokens.
fn condense_contractions(&mut self) {
if self.tokens.len() < 3 {
return;
}

// Start indices of the three-token stretches we are going to condense.
let mut replace_starts = Vec::new();

for idx in 0..self.tokens.len() - 2 {
let a = self.tokens[idx];
let b = self.tokens[idx + 1];
let c = self.tokens[idx + 2];

if matches!(
(a.kind, b.kind, c.kind),
(
TokenKind::Word,
TokenKind::Punctuation(Punctuation::Apostrophe),
TokenKind::Word
)
) {
// Ensure replacements do not overlap: a new match may start only after the previous three-token span.
let should_replace = if let Some(last_idx) = replace_starts.last() {
idx > *last_idx + 2
} else {
true
};

if should_replace {
replace_starts.push(idx);
self.tokens[idx].span.end = c.span.end;
}
}
}

// Rebuild the token stream, dropping the tokens absorbed into each condensed span.
let old = self.tokens.clone();
self.tokens.clear();

// Keep the chunk before the first replacement (empty when nothing matched).
self.tokens.extend_from_slice(
&old[0..replace_starts
.first()
.copied()
.unwrap_or(replace_starts.len())],
);

let mut iter = replace_starts.iter().peekable();

while let (Some(a_idx), b) = (iter.next(), iter.peek()) {
self.tokens.push(old[*a_idx]);

if let Some(b_idx) = b {
self.tokens.extend_from_slice(&old[a_idx + 3..**b_idx]);
}
}

// Keep last chunk.
self.tokens.extend_from_slice(
&old[replace_starts
.last()
.map(|v| v + 3)
.unwrap_or(replace_starts.len())..],
)
}

pub fn tokens(&self) -> impl Iterator<Item = Token> + '_ {
self.tokens.iter().copied()
}
@@ -218,7 +287,7 @@ fn is_sentence_terminator(punctuation: &Punctuation) -> bool {
#[cfg(test)]
mod tests {
use super::Document;
use crate::Token;
use crate::{Span, Token, TokenKind};

impl Document {
fn from_raw_parts(source: Vec<char>, tokens: Vec<Token>, markdown: bool) -> Self {
@@ -230,6 +299,45 @@ mod tests {
}
}

fn assert_condensed_contractions(text: &str, final_tok_count: usize) {
let mut document = Document::new(text, false);
dbg!(&document.tokens);
document.condense_contractions();

assert_eq!(document.tokens.len(), final_tok_count);

let mut document = Document::new(text, true);
dbg!(&document.tokens);
document.condense_contractions();

assert_eq!(document.tokens.len(), final_tok_count);
}

#[test]
fn simple_contraction() {
assert_condensed_contractions("isn't", 1);
}

#[test]
fn simple_contraction2() {
assert_condensed_contractions("wasn't", 1);
}

#[test]
fn simple_contraction3() {
assert_condensed_contractions("There's", 1);
}

#[test]
fn medium_contraction() {
assert_condensed_contractions("isn't wasn't", 3);
}

#[test]
fn medium_contraction2() {
assert_condensed_contractions("There's no way", 5);
}

#[test]
fn parses_sentences_correctly() {
let text = "There were three little pigs. They built three little homes.";
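To make the new pass concrete: "isn't" lexes to Word, Apostrophe, Word, and condense_contractions collapses that run into a single Word token whose span covers all three pieces, which is what the simple_contraction tests assert. Below is a minimal, self-contained sketch of the same scan; the Kind enum and condense function are toys invented for illustration, not harper-core's API, and the committed version instead records start indices so it can widen the first token's span and rebuild the vector in chunks.

// Toy sketch, not harper-core code: real tokens carry spans, not just kinds.
#[derive(Debug, Clone, Copy, PartialEq)]
enum Kind {
    Word,
    Apostrophe,
    Space,
}

// Collapse every `Word Apostrophe Word` run into a single `Word`.
fn condense(tokens: &[Kind]) -> Vec<Kind> {
    let mut out = Vec::new();
    let mut idx = 0;
    while idx < tokens.len() {
        if idx + 2 < tokens.len()
            && tokens[idx] == Kind::Word
            && tokens[idx + 1] == Kind::Apostrophe
            && tokens[idx + 2] == Kind::Word
        {
            // The real pass widens the first token's span to cover all three.
            out.push(Kind::Word);
            idx += 3;
        } else {
            out.push(tokens[idx]);
            idx += 1;
        }
    }
    out
}

fn main() {
    use Kind::*;
    // "isn't wasn't" lexes to seven tokens and condenses to three,
    // mirroring the medium_contraction test above.
    let lexed = [Word, Apostrophe, Word, Space, Word, Apostrophe, Word];
    assert_eq!(condense(&lexed), vec![Word, Space, Word]);
}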
2 changes: 1 addition & 1 deletion harper-core/src/linting/lint.rs
@@ -3,7 +3,7 @@ use std::fmt::Display;
use is_macro::Is;
use serde::{Deserialize, Serialize};

use crate::{document::Document, span::Span};
use crate::span::Span;

#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct Lint {
93 changes: 41 additions & 52 deletions harper-core/src/parsing/lexer.rs
@@ -26,20 +26,21 @@ pub fn lex_to_end_md(source: &[char]) -> Vec<Token> {
// NOTE: the range spits out __byte__ indices, not char indices.
// This is why we keep track above.
for (event, range) in md_parser.into_offset_iter() {
match event {
pulldown_cmark::Event::Text(text) => {
traversed_chars += source_str[traversed_bytes..range.start].chars().count();
traversed_bytes = range.start;
if let pulldown_cmark::Event::Text(text) = event {
traversed_chars += source_str[traversed_bytes..range.start].chars().count();
traversed_bytes = range.start;

let mut new_tokens = lex_to_end_str(text);
dbg!(text.to_string());

new_tokens
.iter_mut()
.for_each(|token| token.span.offset(traversed_chars));
let mut new_tokens = lex_to_end_str(text);

tokens.append(&mut new_tokens);
}
_ => (),
dbg!(&new_tokens);

new_tokens
.iter_mut()
.for_each(|token| token.span.offset(traversed_chars));

tokens.append(&mut new_tokens);
}
}
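The NOTE at the top of this hunk is doing real work: pulldown_cmark's offset iterator reports byte ranges into the source string, while harper's spans index into a &[char], which is why the loop keeps a running char count (traversed_chars) alongside the byte position (traversed_bytes). A standalone illustration of how the two indices diverge (a sketch, not harper-core code):

fn main() {
    // 'é' is two bytes in UTF-8 but a single char.
    let source = "é x";
    let byte_start = source.find('x').unwrap();
    let char_start = source[..byte_start].chars().count();
    assert_eq!(byte_start, 3); // byte offset, as pulldown_cmark reports
    assert_eq!(char_start, 2); // char offset, as harper's spans expect
}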

@@ -90,9 +91,9 @@ pub fn lex_to_end(source: &[char]) -> Vec<Token> {

fn lex_token(source: &[char]) -> Option<FoundToken> {
let lexers = [
lex_punctuation,
lex_spaces,
lex_newlines,
lex_punctuation,
lex_number,
lex_word,
];
@@ -102,6 +103,7 @@ fn lex_token(source: &[char]) -> Option<FoundToken> {
return Some(f);
}
}

None
}
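lex_token is a first-match-wins dispatch: each lexer in the array is tried in order, so the array order encodes priority (this commit moves lex_punctuation from the front of the list to after lex_spaces and lex_newlines). A minimal sketch of the pattern with two toy lexers and a simplified FoundToken, all hypothetical names:

// Toy sketch of the dispatch pattern, not harper-core's real types.
#[derive(Debug, PartialEq)]
struct FoundToken {
    next_index: usize,
    kind: &'static str, // stand-in for TokenKind
}

fn lex_spaces(source: &[char]) -> Option<FoundToken> {
    let len = source.iter().take_while(|c| **c == ' ').count();
    (len > 0).then(|| FoundToken { next_index: len, kind: "space" })
}

fn lex_period(source: &[char]) -> Option<FoundToken> {
    (*source.first()? == '.').then(|| FoundToken { next_index: 1, kind: "period" })
}

fn lex_token(source: &[char]) -> Option<FoundToken> {
    // The first lexer that returns Some wins.
    let lexers = [lex_spaces, lex_period];
    lexers.iter().find_map(|lex| lex(source))
}

fn main() {
    let source: Vec<char> = " .".chars().collect();
    let found = lex_token(&source).unwrap();
    assert_eq!(found.kind, "space");
    assert_eq!(found.next_index, 1);
}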

@@ -184,50 +186,37 @@ fn lex_spaces(source: &[char]) -> Option<FoundToken> {
}
}

fn lex_characters(source: &[char], cs: &str, token: TokenKind) -> Option<FoundToken> {
let sep: Vec<_> = cs.chars().collect();

if source.get(0..cs.len())? == sep {
Some(FoundToken {
token,
next_index: cs.len(),
})
} else {
None
fn lex_punctuation(source: &[char]) -> Option<FoundToken> {
if let Some(found) = lex_quote(source) {
return Some(found);
}
}

macro_rules! lex_punctuation {
($($text:literal => $res:ident),*) => {
fn lex_punctuation(source: &[char]) -> Option<FoundToken> {
if let Some(found) = lex_quote(source){
return Some(found);
}

$(
if let Some(found) = lex_characters(source, $text, TokenKind::Punctuation(Punctuation::$res)){
return Some(found);
}
)*

None
}
let c = source.first()?;

use Punctuation::*;

let punct = match c {
'’' => Apostrophe,
'\'' => Apostrophe,
'.' => Period,
'!' => Bang,
'?' => Question,
':' => Colon,
';' => Semicolon,
',' => Comma,
'-' => Hyphen,
'[' => OpenSquare,
']' => CloseSquare,
'(' => OpenRound,
')' => CloseRound,
'#' => Hash,
_ => return None,
};
}

lex_punctuation! {
"." => Period,
"!" => Bang,
"?" => Question,
":" => Colon,
";" => Semicolon,
"," => Comma,
"-" => Hyphen,
"[" => OpenSquare,
"]" => CloseSquare,
"(" => OpenRound,
")" => CloseRound,
"#" => Hash
Some(FoundToken {
next_index: 1,
token: TokenKind::Punctuation(punct),
})
}

fn lex_quote(source: &[char]) -> Option<FoundToken> {
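This hunk trades the lex_characters helper and the lex_punctuation! macro for a plain match on the first character: every one of these punctuation tokens is exactly one char, so the string-comparison machinery was unnecessary. The new table also adds the mapping the contraction pass depends on: both '\'' and the typographic '’' now lex to Punctuation::Apostrophe. A toy sketch of the shape (Punct and Found are hypothetical stand-ins, not the real TokenKind):

// Toy stand-ins for illustration; harper-core returns TokenKind::Punctuation values.
#[derive(Debug, PartialEq)]
enum Punct {
    Apostrophe,
    Period,
    Bang,
}

struct Found {
    punct: Punct,
    next_index: usize,
}

fn lex_punct(source: &[char]) -> Option<Found> {
    let punct = match source.first()? {
        '\'' | '’' => Punct::Apostrophe, // ASCII and typographic apostrophes
        '.' => Punct::Period,
        '!' => Punct::Bang,
        _ => return None,
    };
    // Every recognized punctuation token is exactly one char long.
    Some(Found { punct, next_index: 1 })
}

fn main() {
    let source: Vec<char> = "’twas".chars().collect();
    let found = lex_punct(&source).unwrap();
    assert_eq!(found.punct, Punct::Apostrophe);
    assert_eq!(found.next_index, 1);
}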
22 changes: 20 additions & 2 deletions harper-core/src/parsing/token.rs
@@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize};

use crate::span::Span;

#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Default)]
pub struct Token {
pub span: Span,
pub kind: TokenKind,
@@ -28,9 +28,10 @@ pub struct FatToken {
pub kind: TokenKind,
}

#[derive(Debug, Is, Clone, Copy, Serialize, Deserialize, PartialEq)]
#[derive(Debug, Is, Clone, Copy, Serialize, Deserialize, PartialEq, Default)]
#[serde(tag = "kind", content = "value")]
pub enum TokenKind {
#[default]
Word,
Punctuation(Punctuation),
Number(f64),
@@ -48,6 +49,10 @@ impl TokenKind {
pub fn as_quote(&self) -> Option<&Quote> {
self.as_punctuation()?.as_quote()
}

pub fn is_apostrophe(&self) -> bool {
matches!(self, TokenKind::Punctuation(Punctuation::Apostrophe))
}
}

#[derive(Debug, Is, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
@@ -95,6 +100,8 @@ pub trait TokenStringExt {
fn iter_words(&self) -> impl Iterator<Item = &Token> + '_;
fn iter_space_indices(&self) -> impl Iterator<Item = usize> + '_;
fn iter_spaces(&self) -> impl Iterator<Item = &Token> + '_;
fn iter_apostrophe_indices(&self) -> impl Iterator<Item = usize> + '_;
fn iter_apostrophes(&self) -> impl Iterator<Item = &Token> + '_;
}

impl TokenStringExt for [Token] {
@@ -123,4 +130,15 @@ impl TokenStringExt for [Token] {
fn iter_spaces(&self) -> impl Iterator<Item = &Token> + '_ {
self.iter_space_indices().map(|i| &self[i])
}

fn iter_apostrophe_indices(&self) -> impl Iterator<Item = usize> + '_ {
self.iter()
.enumerate()
.filter(|(_, t)| t.kind.is_apostrophe())
.map(|(i, _)| i)
}

fn iter_apostrophes(&self) -> impl Iterator<Item = &Token> + '_ {
self.iter_apostrophe_indices().map(|i| &self[i])
}
}
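The two new methods follow the same enumerate-filter-map recipe as the existing iter_space_indices/iter_spaces pair. A self-contained sketch of the recipe over a toy kind enum (hypothetical names, not the real Token):

// Toy sketch; harper-core's version filters on TokenKind::is_apostrophe.
#[derive(Debug, Clone, Copy, PartialEq)]
enum Kind {
    Word,
    Apostrophe,
}

fn iter_apostrophe_indices(tokens: &[Kind]) -> impl Iterator<Item = usize> + '_ {
    tokens
        .iter()
        .enumerate()
        .filter(|(_, kind)| **kind == Kind::Apostrophe)
        .map(|(i, _)| i)
}

fn main() {
    // "isn't" before condensing: Word, Apostrophe, Word.
    let tokens = [Kind::Word, Kind::Apostrophe, Kind::Word];
    let indices: Vec<_> = iter_apostrophe_indices(&tokens).collect();
    assert_eq!(indices, vec![1]);
}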
2 changes: 1 addition & 1 deletion harper-core/src/span.rs
@@ -1,7 +1,7 @@
use serde::{Deserialize, Serialize};

/// A window in a [char].
#[derive(Debug, Clone, Copy, Serialize, Deserialize, Default)]
#[derive(Debug, Clone, Copy, Serialize, Deserialize, Default, PartialEq, Eq)]
pub struct Span {
pub start: usize,
pub end: usize,
}
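Taken together, the derive changes in token.rs and span.rs are what make tokens comparable in tests: PartialEq on Span, Token, and TokenKind permits assert_eq! on whole tokens (the Span/TokenKind imports added to the test module suggest that is how the unshown assertions work), and Default gives a cheap placeholder value. A minimal sketch of the effect, with the serde derives and the kind field omitted for brevity:

// Stand-in types mirroring the derives added in this commit.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
struct Span {
    start: usize,
    end: usize,
}

#[derive(Debug, Clone, Copy, Default, PartialEq)]
struct Token {
    span: Span,
}

fn main() {
    let a = Token { span: Span { start: 0, end: 5 } };
    let b = Token { span: Span { start: 0, end: 5 } };
    assert_eq!(a, b); // only possible once PartialEq is derived
    assert_eq!(Token::default().span, Span::default());
}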
