Skip to content

Commit

Permalink
Markdown parser now properly parses lists
Browse files Browse the repository at this point in the history
  • Loading branch information
elijah-potter committed Jan 30, 2024
1 parent 83d7bf4 commit 1077ceb
Show file tree
Hide file tree
Showing 6 changed files with 65 additions and 32 deletions.
5 changes: 4 additions & 1 deletion harper-core/dictionary.aff
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,10 @@ SFX L Y 1
SFX L 0 ment .

SFX O Y 1
SFX O 0 ful .
SFX O 0 ful .

SFX Q Y 1
SFX Q 0 ally .

REP 90
REP a ei
Expand Down
3 changes: 2 additions & 1 deletion harper-core/dictionary.dict
Original file line number Diff line number Diff line change
Expand Up @@ -11632,7 +11632,7 @@ algebra/SM
algebraic
algebraically
algorithm/SM
algorithmic
algorithmic/Q
alias/GMDS
alibi/GMDS
alien/BGMDS
Expand Down Expand Up @@ -49591,3 +49591,4 @@ raytracer
viewport
backend
frontend
automata
41 changes: 17 additions & 24 deletions harper-core/src/document.rs
Original file line number Diff line number Diff line change
Expand Up @@ -172,14 +172,8 @@ impl Document {
/// Iterate over the locations of the sentence terminators in the document.
fn sentence_terminators(&self) -> impl Iterator<Item = usize> + '_ {
self.tokens.iter().enumerate().filter_map(|(index, token)| {
if let Token {
kind: TokenKind::Punctuation(punct),
..
} = token
{
if is_sentence_terminator(punct) {
return Some(index);
}
if is_sentence_terminator(&token.kind) {
return Some(index);
}
None
})
Expand All @@ -192,14 +186,8 @@ impl Document {
.enumerate()
.rev()
.find_map(|(index, token)| {
if let Token {
kind: TokenKind::Punctuation(punct),
..
} = token
{
if is_sentence_terminator(punct) {
return Some(index);
}
if is_sentence_terminator(&token.kind) {
return Some(index);
}
None
})
Expand Down Expand Up @@ -287,13 +275,17 @@ impl Display for Document {
}
}

fn is_sentence_terminator(punctuation: &Punctuation) -> bool {
[
Punctuation::Period,
Punctuation::Bang,
Punctuation::Question,
]
.contains(punctuation)
/// Reports whether a token of the given kind ends a sentence.
///
/// A sentence is terminated by a period, a bang (`!`), a question mark, or
/// any newline token; every other token kind is not a terminator.
fn is_sentence_terminator(token: &TokenKind) -> bool {
    matches!(
        token,
        TokenKind::Newline(_)
            | TokenKind::Punctuation(
                Punctuation::Period | Punctuation::Bang | Punctuation::Question
            )
    )
}

#[cfg(test)]
Expand All @@ -313,7 +305,8 @@ mod tests {
let mut document = Document::new(text, Box::new(Markdown));
document.condense_contractions();

assert_eq!(document.tokens.len(), final_tok_count);
// We add one because the Markdown parser inserts a newline at end-of-input.
assert_eq!(document.tokens.len(), final_tok_count + 1);
}

#[test]
Expand Down
22 changes: 16 additions & 6 deletions harper-core/src/parsers/markdown.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,15 +24,28 @@ impl Parser for Markdown {
// NOTE: the range spits out __byte__ indices, not char indices.
// This is why we keep track above.
for (event, range) in md_parser.into_offset_iter() {
if range.start > traversed_bytes {
traversed_chars += source_str[traversed_bytes..range.start].chars().count();
traversed_bytes = range.start;
}

match event {
pulldown_cmark::Event::HardBreak => {
tokens.push(Token {
span: Span::new_with_len(traversed_chars, 1),
kind: TokenKind::Newline(1),
});
}
pulldown_cmark::Event::Start(tag) => stack.push(tag),
pulldown_cmark::Event::End(pulldown_cmark::Tag::Paragraph)
| pulldown_cmark::Event::End(pulldown_cmark::Tag::Item) => tokens.push(Token {
span: Span::new_with_len(traversed_chars, 1),
kind: TokenKind::Newline(1),
}),
pulldown_cmark::Event::End(_) => {
stack.pop();
}
pulldown_cmark::Event::Code(code) => {
traversed_chars += source_str[traversed_bytes..range.start].chars().count();
traversed_bytes = range.start;

let chunk_len = code.chars().count();

tokens.push(Token {
Expand All @@ -41,9 +54,6 @@ impl Parser for Markdown {
});
}
pulldown_cmark::Event::Text(text) => {
traversed_chars += source_str[traversed_bytes..range.start].chars().count();
traversed_bytes = range.start;

let chunk_len = text.chars().count();

if let Some(tag) = stack.last() {
Expand Down
19 changes: 19 additions & 0 deletions harper-core/src/parsers/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,25 @@ mod tests {
Word,
Space(1),
Word,
Newline(1),
],
);
}

// Two paragraphs separated by a blank line: the expected token stream carries
// a `Newline(1)` after the tokens of each paragraph, including the last one.
// The emphasis (`__hello__`) and link (`[my]()`) markup is expected to
// contribute only `Word` tokens.
#[test]
fn inserts_newlines() {
assert_tokens_eq_md(
"__hello__ world,\n\n[my]() friend",
&[
Word,
Space(1),
Word,
Punctuation(Punctuation::Comma),
Newline(1),
Word,
Space(1),
Word,
Newline(1),
],
);
}
Expand Down
7 changes: 7 additions & 0 deletions harper-core/src/span.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,13 @@ impl Span {
Self { start, end }
}

/// Builds a span that begins at `start` and covers `len` positions.
///
/// The resulting `end` is `start + len`, so `new_with_len(s, 0)` yields an
/// empty span located at `s`.
pub fn new_with_len(start: usize, len: usize) -> Self {
    let end = start + len;
    Self { start, end }
}

pub fn len(&self) -> usize {
self.end - self.start
}
Expand Down

0 comments on commit 1077ceb

Please sign in to comment.