Skip to content

Commit

Permalink
Markdown stress testing (#108)
Browse files Browse the repository at this point in the history
* Fix big sorting issue

* better differentiate between different block level elements
  • Loading branch information
benbrandt authored Mar 5, 2024
1 parent e1bf7ac commit 9876af1
Show file tree
Hide file tree
Showing 32 changed files with 1,353 additions and 1,285 deletions.
15 changes: 4 additions & 11 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -230,9 +230,6 @@ trait SemanticSplit {
/// Retrieve ranges for each semantic level in the entire text
fn ranges(&self) -> impl Iterator<Item = &(Self::Level, Range<usize>)> + '_;

/// Maximum level of semantic splitting in the text
fn max_level(&self) -> Self::Level;

/// Retrieve ranges for all sections of a given level after an offset
fn ranges_after_offset(
&self,
Expand All @@ -249,23 +246,19 @@ trait SemanticSplit {

/// Return a unique, sorted list of all semantic levels whose ranges start at or after the offset,
/// combined with all of the persistent levels, in order from smallest to largest
fn levels_in_next_max_chunk(&self, offset: usize) -> impl Iterator<Item = Self::Level> + '_ {
let max_level = self.max_level();
fn levels_in_remaining_text(&self, offset: usize) -> impl Iterator<Item = Self::Level> + '_ {
let existing_levels = self
.ranges()
// Only start taking them from the offset
.filter(|(_, sep)| sep.start >= offset)
// Stop once we hit the first of the max level
.take_while_inclusive(|(l, _)| l < &max_level)
.map(|(l, _)| l)
.copied();
.map(|(l, _)| l);

Self::PERSISTENT_LEVELS
.iter()
.copied()
.chain(existing_levels)
.sorted()
.dedup()
.copied()
}

/// Split a given text into iterator over each semantic chunk
Expand Down Expand Up @@ -451,7 +444,7 @@ where
fn next_sections(&'sizer self) -> Option<impl Iterator<Item = (usize, &'text str)> + 'sizer> {
// Next levels to try: the persistent levels plus every level that still occurs in the text
// from the cursor onward, in order from smallest to largest.
let mut levels = self.semantic_split.levels_in_next_max_chunk(self.cursor);
let mut levels = self.semantic_split.levels_in_remaining_text(self.cursor);
// Get starting level
let mut semantic_level = levels.next()?;
// If we aren't at the highest semantic level, stop iterating sections that go beyond the range of the next level.
Expand Down
22 changes: 1 addition & 21 deletions src/text.rs
Original file line number Diff line number Diff line change
Expand Up @@ -180,8 +180,6 @@ static LINEBREAKS: Lazy<Regex> = Lazy::new(|| Regex::new(r"(\r\n)+|\r+|\n+").unw
struct LineBreaks {
/// Range of each line break and its precalculated semantic level
line_breaks: Vec<(SemanticLevel, Range<usize>)>,
/// Maximum number of linebreaks in a given text
max_level: SemanticLevel,
}

impl SemanticSplit for LineBreaks {
Expand Down Expand Up @@ -215,31 +213,14 @@ impl SemanticSplit for LineBreaks {
})
.collect::<Vec<_>>();

let max_level = *line_breaks
.iter()
.map(|(l, _)| l)
.max_by_key(|level| match level {
SemanticLevel::LineBreak(n) => n,
_ => &0,
})
.unwrap_or(&SemanticLevel::Sentence);

Self {
line_breaks,
max_level,
}
Self { line_breaks }
}

/// Retrieve ranges for each semantic level in the entire text.
///
/// Yields every precalculated `(level, range)` pair stored in `line_breaks`;
/// no filtering by level or offset is applied here.
fn ranges(&self) -> impl Iterator<Item = &(SemanticLevel, Range<usize>)> + '_ {
self.line_breaks.iter()
}

/// Maximum level of semantic splitting in the text
fn max_level(&self) -> SemanticLevel {
self.max_level
}

/// Split a given text into iterator over each semantic chunk
#[auto_enum(Iterator)]
fn semantic_chunks<'splitter, 'text: 'splitter>(
Expand Down Expand Up @@ -509,6 +490,5 @@ mod tests {
],
linebreaks.line_breaks
);
assert_eq!(SemanticLevel::LineBreak(3), linebreaks.max_level);
}
}
Loading

0 comments on commit 9876af1

Please sign in to comment.