Skip to content

Commit

Permalink
Indicate which chunk an individual range should be a part of
Browse files Browse the repository at this point in the history
  • Loading branch information
benbrandt committed Feb 10, 2024
1 parent c8a32d0 commit acec48a
Show file tree
Hide file tree
Showing 3 changed files with 117 additions and 26 deletions.
63 changes: 49 additions & 14 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -195,14 +195,30 @@ impl ChunkCapacity for RangeToInclusive<usize> {
}
}

/// How a particular semantic level relates to surrounding text elements.
///
/// When text is split on a separator range, this decides which resulting
/// section the separator's own text ends up in.
///
/// `Ord`/`PartialOrd` are derived, so variants compare in declaration order
/// (`Prev < Own < Next`).
#[derive(Copy, Clone, Debug, Eq, Ord, PartialEq, PartialOrd)]
enum SemanticSplitPosition {
/// The semantic level should be included in the previous chunk, i.e. the
/// separator is appended to the text that precedes it.
Prev,
/// The semantic level should be treated as its own chunk, i.e. the
/// preceding text and the separator are emitted as two separate sections.
Own,
/// The semantic level should be included in the next chunk, i.e. the
/// separator becomes the start of the text that follows it.
Next,
}

/// Information required by generic Semantic Levels
trait Level {
/// Where a separator found at this level belongs relative to the text
/// around it: attached to the previous section, emitted on its own, or
/// attached to the next section.
fn split_position(&self) -> SemanticSplitPosition;
}

/// Implementation that dictates the semantic split points available.
/// For plain text, this goes from characters, to grapheme clusters, to words,
/// to sentences, to linebreaks.
/// For something like Markdown, this would also include things like headers,
/// lists, and code blocks.
trait SemanticSplit {
/// Internal type used to represent the level of semantic splitting.
type Level: Copy + Ord + PartialOrd + 'static;
type Level: Copy + Level + Ord + PartialOrd + 'static;

/// Levels that are always considered in splitting text, because they are always present.
const PERSISTENT_LEVELS: &'static [Self::Level];
Expand Down Expand Up @@ -499,9 +515,9 @@ where
}

/// Given a list of separator ranges, construct the sections of the text
fn split_str_by_separator(
fn split_str_by_separator<L: Level>(
text: &str,
separator_ranges: impl Iterator<Item = Range<usize>>,
separator_ranges: impl Iterator<Item = (L, Range<usize>)>,
) -> impl Iterator<Item = (usize, &str)> {
let mut cursor = 0;
let mut final_match = false;
Expand All @@ -515,18 +531,37 @@ fn split_str_by_separator(
text.get(cursor..).map(|t| Either::Left(once((cursor, t))))
}
// Return text preceding match + the match
Some(range) => {
Some((level, range)) => {
let offset = cursor;
let prev_section = text
.get(cursor..range.start)
.expect("invalid character sequence");
let separator = text
.get(range.start..range.end)
.expect("invalid character sequence");
cursor = range.end;
Some(Either::Right(
[(offset, prev_section), (range.start, separator)].into_iter(),
))
match level.split_position() {
SemanticSplitPosition::Prev => {
let section = text
.get(cursor..range.end)
.expect("invalid character sequence");
cursor = range.end;
Some(Either::Left(once((offset, section))))
}
SemanticSplitPosition::Own => {
let prev_section = text
.get(cursor..range.start)
.expect("invalid character sequence");
let separator = text
.get(range.start..range.end)
.expect("invalid character sequence");
cursor = range.end;
Some(Either::Right(
[(offset, prev_section), (range.start, separator)].into_iter(),
))
}
SemanticSplitPosition::Next => {
let prev_section = text
.get(cursor..range.start)
.expect("invalid character sequence");
// Separator will be part of the next chunk
cursor = range.start;
Some(Either::Left(once((offset, prev_section))))
}
}
}
})
.flatten()
Expand Down
12 changes: 10 additions & 2 deletions src/text.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ use regex::Regex;
use unicode_segmentation::UnicodeSegmentation;

use crate::{
split_str_by_separator, Characters, ChunkCapacity, ChunkSizer, SemanticSplit, TextChunks,
split_str_by_separator, Characters, ChunkCapacity, ChunkSizer, Level, SemanticSplit,
SemanticSplitPosition, TextChunks,
};

/// Default plain-text splitter. Recursively splits chunks into the largest
Expand Down Expand Up @@ -163,6 +164,13 @@ enum SemanticLevel {
LineBreak(usize),
}

impl Level for SemanticLevel {
/// All of these levels should be treated as their own chunk.
///
/// Returns [`SemanticSplitPosition::Own`] unconditionally; the variant of
/// `self` is not inspected.
fn split_position(&self) -> SemanticSplitPosition {
SemanticSplitPosition::Own
}
}

// Lazy so that we don't have to compile them more than once
static LINEBREAKS: Lazy<Regex> = Lazy::new(|| Regex::new(r"(\r\n)+|\r+|\n+").unwrap());

Expand Down Expand Up @@ -259,7 +267,7 @@ impl SemanticSplit for LineBreaks {
SemanticLevel::LineBreak(_) => split_str_by_separator(
text,
self.ranges_after_offset(offset, semantic_level)
.map(move |(_, sep)| sep.start - offset..sep.end - offset),
.map(move |(l, sep)| (*l, sep.start - offset..sep.end - offset)),
)
.map(move |(i, str)| (offset + i, str)),
}
Expand Down
68 changes: 58 additions & 10 deletions src/unstable_markdown.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ use pulldown_cmark::{Event, Options, Parser};
use unicode_segmentation::UnicodeSegmentation;

use crate::{
split_str_by_separator, Characters, ChunkCapacity, ChunkSizer, SemanticSplit, TextChunks,
split_str_by_separator, Characters, ChunkCapacity, ChunkSizer, Level, SemanticSplit,
SemanticSplitPosition, TextChunks,
};

/// Markdown splitter. Recursively splits chunks into the largest
Expand Down Expand Up @@ -159,14 +160,29 @@ enum SemanticLevel {
/// An inline element that is within a larger element such as a paragraph, but
/// more specific than a sentence.
/// Falls back to [`Self::Sentence`]
InlineElement,
InlineElement(SemanticSplitPosition),
/// Hard line break (two newlines), which signifies a new element in Markdown
/// Falls back to [`Self::SoftBreak`]
HardBreak,
/// thematic break/horizontal rule
Rule,
}

impl Level for SemanticLevel {
    /// Where a separator of this level belongs relative to its neighbours.
    ///
    /// Every level except [`SemanticLevel::InlineElement`] is emitted as its
    /// own section. An inline element carries the position chosen when its
    /// range was recorded (e.g. `Prev` for footnote references, `Next` for
    /// task list markers), and that stored position is returned as-is so the
    /// element stays attached to the text it belongs to.
    fn split_position(&self) -> SemanticSplitPosition {
        match self {
            SemanticLevel::Char
            | SemanticLevel::GraphemeCluster
            | SemanticLevel::Word
            | SemanticLevel::Sentence
            | SemanticLevel::SoftBreak
            | SemanticLevel::HardBreak
            | SemanticLevel::Rule => SemanticSplitPosition::Own,
            // Honour the position stored in the variant instead of discarding it;
            // otherwise the Prev/Next tagging done while parsing has no effect.
            SemanticLevel::InlineElement(position) => *position,
        }
    }
}

/// Captures information about markdown structure for a given text, and their
/// various semantic levels.
#[derive(Debug)]
Expand Down Expand Up @@ -195,9 +211,15 @@ impl SemanticSplit for Markdown {
| Event::End(_)
| Event::Text(_)
| Event::Code(_)
| Event::Html(_)
| Event::FootnoteReference(_) => None,
Event::TaskListMarker(_) => Some((SemanticLevel::InlineElement, range)),
| Event::Html(_) => None,
Event::FootnoteReference(_) => Some((
SemanticLevel::InlineElement(SemanticSplitPosition::Prev),
range,
)),
Event::TaskListMarker(_) => Some((
SemanticLevel::InlineElement(SemanticSplitPosition::Next),
range,
)),
Event::SoftBreak => Some((SemanticLevel::SoftBreak, range)),
Event::HardBreak => Some((SemanticLevel::HardBreak, range)),
Event::Rule => Some((SemanticLevel::Rule, range)),
Expand Down Expand Up @@ -246,13 +268,13 @@ impl SemanticSplit for Markdown {
SemanticLevel::Sentence => text
.split_sentence_bound_indices()
.map(move |(i, str)| (offset + i, str)),
SemanticLevel::InlineElement
SemanticLevel::InlineElement(_)
| SemanticLevel::SoftBreak
| SemanticLevel::HardBreak
| SemanticLevel::Rule => split_str_by_separator(
text,
self.ranges_after_offset(offset, semantic_level)
.map(move |(_, sep)| sep.start - offset..sep.end - offset),
.map(move |(l, sep)| (*l, sep.start - offset..sep.end - offset)),
)
.map(move |(i, str)| (offset + i, str)),
}
Expand Down Expand Up @@ -456,12 +478,38 @@ mod tests {

assert_eq!(
vec![
&(SemanticLevel::InlineElement, 2..5),
&(SemanticLevel::InlineElement, 24..27)
&(
SemanticLevel::InlineElement(SemanticSplitPosition::Next),
2..5
),
&(
SemanticLevel::InlineElement(SemanticSplitPosition::Next),
24..27
)
],
markdown.ranges().collect::<Vec<_>>()
);
assert_eq!(SemanticLevel::InlineElement, markdown.max_level());
assert_eq!(
SemanticLevel::InlineElement(SemanticSplitPosition::Next),
markdown.max_level()
);
}

#[test]
fn test_footnote_reference() {
    // A footnote reference is an inline element that belongs with the text
    // preceding it, so it is expected to be tagged with `Prev`.
    let expected_level = SemanticLevel::InlineElement(SemanticSplitPosition::Prev);
    // Byte range of `[^1]` within the input.
    let expected_range = (expected_level, 8..12);

    let markdown = Markdown::new("Footnote[^1]");
    let found_ranges = markdown.ranges().collect::<Vec<_>>();

    assert_eq!(vec![&expected_range], found_ranges);
    assert_eq!(expected_level, markdown.max_level());
}

#[test]
Expand Down

0 comments on commit acec48a

Please sign in to comment.