From acec48a1c2503fbb03df8f87a1c530957a26ea49 Mon Sep 17 00:00:00 2001 From: Ben Brandt Date: Sat, 10 Feb 2024 22:46:59 +0100 Subject: [PATCH] Indicate which chunk an individual range should be a part of --- src/lib.rs | 63 ++++++++++++++++++++++++++++--------- src/text.rs | 12 +++++-- src/unstable_markdown.rs | 68 ++++++++++++++++++++++++++++++++++------ 3 files changed, 117 insertions(+), 26 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 5ca67d0d..abc6436d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -195,6 +195,22 @@ impl ChunkCapacity for RangeToInclusive { } } +/// How a particular semantic level relates to surrounding text elements. +#[derive(Copy, Clone, Debug, Eq, Ord, PartialEq, PartialOrd)] +enum SemanticSplitPosition { + /// The semantic level should be included in the previous chunk. + Prev, + /// The semantic level should be treated as its own chunk. + Own, + /// The semantic level should be included in the next chunk. + Next, +} + +/// Information required by generic Semantic Levels +trait Level { + fn split_position(&self) -> SemanticSplitPosition; +} + /// Implementation that dictates the semantic split points available. /// For plain text, this goes from characters, to grapheme clusters, to words, /// to sentences, to linebreaks. @@ -202,7 +218,7 @@ impl ChunkCapacity for RangeToInclusive { /// lists, and code blocks. trait SemanticSplit { /// Internal type used to represent the level of semantic splitting. - type Level: Copy + Ord + PartialOrd + 'static; + type Level: Copy + Level + Ord + PartialOrd + 'static; /// Levels that are always considered in splitting text, because they are always present. const PERSISTENT_LEVELS: &'static [Self::Level]; @@ -499,9 +515,9 @@ where } /// Given a list of separator ranges, construct the sections of the text -fn split_str_by_separator( +fn split_str_by_separator( text: &str, - separator_ranges: impl Iterator>, + separator_ranges: impl Iterator)>, ) -> impl Iterator { let mut cursor = 0; let mut final_match = false; @@ -515,18 +531,37 @@ fn split_str_by_separator( text.get(cursor..).map(|t| Either::Left(once((cursor, t)))) } // Return text preceding match + the match - Some(range) => { + Some((level, range)) => { let offset = cursor; - let prev_section = text - .get(cursor..range.start) - .expect("invalid character sequence"); - let separator = text - .get(range.start..range.end) - .expect("invalid character sequence"); - cursor = range.end; - Some(Either::Right( - [(offset, prev_section), (range.start, separator)].into_iter(), - )) + match level.split_position() { + SemanticSplitPosition::Prev => { + let section = text + .get(cursor..range.end) + .expect("invalid character sequence"); + cursor = range.end; + Some(Either::Left(once((offset, section)))) + } + SemanticSplitPosition::Own => { + let prev_section = text + .get(cursor..range.start) + .expect("invalid character sequence"); + let separator = text + .get(range.start..range.end) + .expect("invalid character sequence"); + cursor = range.end; + Some(Either::Right( + [(offset, prev_section), (range.start, separator)].into_iter(), + )) + } + SemanticSplitPosition::Next => { + let prev_section = text + .get(cursor..range.start) + .expect("invalid character sequence"); + // Separator will be part of the next chunk + cursor = range.start; + Some(Either::Left(once((offset, prev_section)))) + } + } } }) .flatten() diff --git a/src/text.rs b/src/text.rs index 3f012f6b..8d609e5e 100644 --- a/src/text.rs +++ b/src/text.rs @@ -11,7 +11,8 @@ use regex::Regex; use unicode_segmentation::UnicodeSegmentation; use crate::{ - split_str_by_separator, Characters, ChunkCapacity, ChunkSizer, SemanticSplit, TextChunks, + split_str_by_separator, Characters, ChunkCapacity, ChunkSizer, Level, SemanticSplit, + SemanticSplitPosition, TextChunks, }; /// Default plain-text splitter. Recursively splits chunks into the largest @@ -163,6 +164,13 @@ enum SemanticLevel { LineBreak(usize), } +impl Level for SemanticLevel { + /// All of these levels should be treated as their own chunk + fn split_position(&self) -> SemanticSplitPosition { + SemanticSplitPosition::Own + } +} + // Lazy so that we don't have to compile them more than once static LINEBREAKS: Lazy = Lazy::new(|| Regex::new(r"(\r\n)+|\r+|\n+").unwrap()); @@ -259,7 +267,7 @@ impl SemanticSplit for LineBreaks { SemanticLevel::LineBreak(_) => split_str_by_separator( text, self.ranges_after_offset(offset, semantic_level) - .map(move |(_, sep)| sep.start - offset..sep.end - offset), + .map(move |(l, sep)| (*l, sep.start - offset..sep.end - offset)), ) .map(move |(i, str)| (offset + i, str)), } diff --git a/src/unstable_markdown.rs b/src/unstable_markdown.rs index 999fb46e..c30ac751 100644 --- a/src/unstable_markdown.rs +++ b/src/unstable_markdown.rs @@ -11,7 +11,8 @@ use pulldown_cmark::{Event, Options, Parser}; use unicode_segmentation::UnicodeSegmentation; use crate::{ - split_str_by_separator, Characters, ChunkCapacity, ChunkSizer, SemanticSplit, TextChunks, + split_str_by_separator, Characters, ChunkCapacity, ChunkSizer, Level, SemanticSplit, + SemanticSplitPosition, TextChunks, }; /// Markdown splitter. Recursively splits chunks into the largest @@ -159,7 +160,7 @@ enum SemanticLevel { /// An inline element that is within a larger element such as a paragraph, but /// more specific than a sentence. /// Falls back to [`Self::Sentence`] - InlineElement, + InlineElement(SemanticSplitPosition), /// Hard line break (two newlines), which signifies a new element in Markdown /// Falls back to [`Self::SoftBreak`] HardBreak, @@ -167,6 +168,21 @@ enum SemanticLevel { Rule, } +impl Level for SemanticLevel { + fn split_position(&self) -> SemanticSplitPosition { + match self { + SemanticLevel::Char + | SemanticLevel::GraphemeCluster + | SemanticLevel::Word + | SemanticLevel::Sentence + | SemanticLevel::SoftBreak + | SemanticLevel::InlineElement(_) + | SemanticLevel::HardBreak + | SemanticLevel::Rule => SemanticSplitPosition::Own, + } + } +} + /// Captures information about markdown structure for a given text, and their /// various semantic levels. #[derive(Debug)] @@ -195,9 +211,15 @@ impl SemanticSplit for Markdown { | Event::End(_) | Event::Text(_) | Event::Code(_) - | Event::Html(_) - | Event::FootnoteReference(_) => None, - Event::TaskListMarker(_) => Some((SemanticLevel::InlineElement, range)), + | Event::Html(_) => None, + Event::FootnoteReference(_) => Some(( + SemanticLevel::InlineElement(SemanticSplitPosition::Prev), + range, + )), + Event::TaskListMarker(_) => Some(( + SemanticLevel::InlineElement(SemanticSplitPosition::Next), + range, + )), Event::SoftBreak => Some((SemanticLevel::SoftBreak, range)), Event::HardBreak => Some((SemanticLevel::HardBreak, range)), Event::Rule => Some((SemanticLevel::Rule, range)), @@ -246,13 +268,13 @@ impl SemanticSplit for Markdown { SemanticLevel::Sentence => text .split_sentence_bound_indices() .map(move |(i, str)| (offset + i, str)), - SemanticLevel::InlineElement + SemanticLevel::InlineElement(_) | SemanticLevel::SoftBreak | SemanticLevel::HardBreak | SemanticLevel::Rule => split_str_by_separator( text, self.ranges_after_offset(offset, semantic_level) - .map(move |(_, sep)| sep.start - offset..sep.end - offset), + .map(move |(l, sep)| (*l, sep.start - offset..sep.end - offset)), ) .map(move |(i, str)| (offset + i, str)), } @@ -456,12 +478,38 @@ mod tests { assert_eq!( vec![ - &(SemanticLevel::InlineElement, 2..5), - &(SemanticLevel::InlineElement, 24..27) + &( + SemanticLevel::InlineElement(SemanticSplitPosition::Next), + 2..5 + ), + &( + SemanticLevel::InlineElement(SemanticSplitPosition::Next), + 24..27 + ) ], markdown.ranges().collect::>() ); - assert_eq!(SemanticLevel::InlineElement, markdown.max_level()); + assert_eq!( + SemanticLevel::InlineElement(SemanticSplitPosition::Next), + markdown.max_level() + ); + } + + #[test] + fn test_footnote_reference() { + let markdown = Markdown::new("Footnote[^1]"); + + assert_eq!( + vec![&( + SemanticLevel::InlineElement(SemanticSplitPosition::Prev), + 8..12 + ),], + markdown.ranges().collect::>() + ); + assert_eq!( + SemanticLevel::InlineElement(SemanticSplitPosition::Prev), + markdown.max_level() + ); } #[test]