Indicate which chunk an individual range should be a part of

benbrandt · Feb 10, 2024 · acec48a · acec48a
1 parent c8a32d0
commit acec48a
Show file tree

Hide file tree

Showing 3 changed files with 117 additions and 26 deletions.
diff --git a/src/lib.rs b/src/lib.rs
@@ -195,14 +195,30 @@ impl ChunkCapacity for RangeToInclusive<usize> {
     }
 }
 
+/// How a particular semantic level relates to surrounding text elements.
+///
+/// Returned by [`Level::split_position`] to say which neighbouring section the
+/// text covered by a level's range should end up in when the text is split.
+/// The derived `Ord` follows declaration order: `Prev < Own < Next`.
+#[derive(Copy, Clone, Debug, Eq, Ord, PartialEq, PartialOrd)]
+enum SemanticSplitPosition {
+    /// The semantic level should be included in the previous chunk.
+    /// The range is appended to the end of the section that precedes it.
+    Prev,
+    /// The semantic level should be treated as its own chunk.
+    /// The preceding text and the range are yielded as two separate sections.
+    Own,
+    /// The semantic level should be included in the next chunk.
+    /// The preceding section stops where the range starts, and the range
+    /// becomes the beginning of whatever section follows.
+    Next,
+}
+
+/// Information required by generic Semantic Levels.
+///
+/// Implemented by each splitter's level type so that shared splitting code can
+/// ask a level how its range should be attached to the surrounding text.
+trait Level {
+    /// Where the text covered by this level belongs relative to the sections
+    /// around it: the previous section, a section of its own, or the next one.
+    fn split_position(&self) -> SemanticSplitPosition;
+}
+
 /// Implementation that dictates the semantic split points available.
 /// For plain text, this goes from characters, to grapheme clusters, to words,
 /// to sentences, to linebreaks.
 /// For something like Markdown, this would also include things like headers,
 /// lists, and code blocks.
 trait SemanticSplit {
     /// Internal type used to represent the level of semantic splitting.
-    type Level: Copy + Ord + PartialOrd + 'static;
+    type Level: Copy + Level + Ord + PartialOrd + 'static;
 
     /// Levels that are always considered in splitting text, because they are always present.
     const PERSISTENT_LEVELS: &'static [Self::Level];
@@ -499,9 +515,9 @@ where
 }
 
 /// Given a list of separator ranges, construct the sections of the text
-fn split_str_by_separator(
+fn split_str_by_separator<L: Level>(
     text: &str,
-    separator_ranges: impl Iterator<Item = Range<usize>>,
+    separator_ranges: impl Iterator<Item = (L, Range<usize>)>,
 ) -> impl Iterator<Item = (usize, &str)> {
     let mut cursor = 0;
     let mut final_match = false;
@@ -515,18 +531,37 @@ fn split_str_by_separator(
                 text.get(cursor..).map(|t| Either::Left(once((cursor, t))))
             }
             // Return text preceding match + the match
-            Some(range) => {
+            Some((level, range)) => {
                 let offset = cursor;
-                let prev_section = text
-                    .get(cursor..range.start)
-                    .expect("invalid character sequence");
-                let separator = text
-                    .get(range.start..range.end)
-                    .expect("invalid character sequence");
-                cursor = range.end;
-                Some(Either::Right(
-                    [(offset, prev_section), (range.start, separator)].into_iter(),
-                ))
+                match level.split_position() {
+                    SemanticSplitPosition::Prev => {
+                        let section = text
+                            .get(cursor..range.end)
+                            .expect("invalid character sequence");
+                        cursor = range.end;
+                        Some(Either::Left(once((offset, section))))
+                    }
+                    SemanticSplitPosition::Own => {
+                        let prev_section = text
+                            .get(cursor..range.start)
+                            .expect("invalid character sequence");
+                        let separator = text
+                            .get(range.start..range.end)
+                            .expect("invalid character sequence");
+                        cursor = range.end;
+                        Some(Either::Right(
+                            [(offset, prev_section), (range.start, separator)].into_iter(),
+                        ))
+                    }
+                    SemanticSplitPosition::Next => {
+                        let prev_section = text
+                            .get(cursor..range.start)
+                            .expect("invalid character sequence");
+                        // Separator will be part of the next chunk
+                        cursor = range.start;
+                        Some(Either::Left(once((offset, prev_section))))
+                    }
+                }
             }
         })
         .flatten()

diff --git a/src/text.rs b/src/text.rs
@@ -11,7 +11,8 @@ use regex::Regex;
 use unicode_segmentation::UnicodeSegmentation;
 
 use crate::{
-    split_str_by_separator, Characters, ChunkCapacity, ChunkSizer, SemanticSplit, TextChunks,
+    split_str_by_separator, Characters, ChunkCapacity, ChunkSizer, Level, SemanticSplit,
+    SemanticSplitPosition, TextChunks,
 };
 
 /// Default plain-text splitter. Recursively splits chunks into the largest
@@ -163,6 +164,13 @@ enum SemanticLevel {
     LineBreak(usize),
 }
 
+/// Split behaviour for the plain-text semantic levels.
+impl Level for SemanticLevel {
+    /// All of these levels should be treated as their own chunk.
+    ///
+    /// The answer does not depend on the variant: every level reports
+    /// [`SemanticSplitPosition::Own`].
+    fn split_position(&self) -> SemanticSplitPosition {
+        SemanticSplitPosition::Own
+    }
+}
+
 // Lazy so that we don't have to compile them more than once
 static LINEBREAKS: Lazy<Regex> = Lazy::new(|| Regex::new(r"(\r\n)+|\r+|\n+").unwrap());
 
@@ -259,7 +267,7 @@ impl SemanticSplit for LineBreaks {
             SemanticLevel::LineBreak(_) => split_str_by_separator(
                 text,
                 self.ranges_after_offset(offset, semantic_level)
-                    .map(move |(_, sep)| sep.start - offset..sep.end - offset),
+                    .map(move |(l, sep)| (*l, sep.start - offset..sep.end - offset)),
             )
             .map(move |(i, str)| (offset + i, str)),
         }

diff --git a/src/unstable_markdown.rs b/src/unstable_markdown.rs
@@ -11,7 +11,8 @@ use pulldown_cmark::{Event, Options, Parser};
 use unicode_segmentation::UnicodeSegmentation;
 
 use crate::{
-    split_str_by_separator, Characters, ChunkCapacity, ChunkSizer, SemanticSplit, TextChunks,
+    split_str_by_separator, Characters, ChunkCapacity, ChunkSizer, Level, SemanticSplit,
+    SemanticSplitPosition, TextChunks,
 };
 
 /// Markdown splitter. Recursively splits chunks into the largest
@@ -159,14 +160,29 @@ enum SemanticLevel {
     /// An inline element that is within a larger element such as a paragraph, but
     /// more specific than a sentence.
     /// Falls back to [`Self::Sentence`]
-    InlineElement,
+    InlineElement(SemanticSplitPosition),
     /// Hard line break (two newlines), which signifies a new element in Markdown
     /// Falls back to [`Self::SoftBreak`]
     HardBreak,
     /// thematic break/horizontal rule
     Rule,
 }
 
+/// Split behaviour for the Markdown semantic levels.
+impl Level for SemanticLevel {
+    /// Reports where the text covered by this level belongs relative to the
+    /// sections around it.
+    ///
+    /// Inline elements carry their own position (a footnote reference is
+    /// recorded as `Prev`, a task list marker as `Next`), so that stored value
+    /// is returned. Every other level is split out as its own section.
+    fn split_position(&self) -> SemanticSplitPosition {
+        match self {
+            // Honour the position recorded when the element was parsed;
+            // returning a fixed `Own` here would silently discard it.
+            SemanticLevel::InlineElement(position) => *position,
+            SemanticLevel::Char
+            | SemanticLevel::GraphemeCluster
+            | SemanticLevel::Word
+            | SemanticLevel::Sentence
+            | SemanticLevel::SoftBreak
+            | SemanticLevel::HardBreak
+            | SemanticLevel::Rule => SemanticSplitPosition::Own,
+        }
+    }
+}
+
 /// Captures information about markdown structure for a given text, and their
 /// various semantic levels.
 #[derive(Debug)]
@@ -195,9 +211,15 @@ impl SemanticSplit for Markdown {
                 | Event::End(_)
                 | Event::Text(_)
                 | Event::Code(_)
-                | Event::Html(_)
-                | Event::FootnoteReference(_) => None,
-                Event::TaskListMarker(_) => Some((SemanticLevel::InlineElement, range)),
+                | Event::Html(_) => None,
+                Event::FootnoteReference(_) => Some((
+                    SemanticLevel::InlineElement(SemanticSplitPosition::Prev),
+                    range,
+                )),
+                Event::TaskListMarker(_) => Some((
+                    SemanticLevel::InlineElement(SemanticSplitPosition::Next),
+                    range,
+                )),
                 Event::SoftBreak => Some((SemanticLevel::SoftBreak, range)),
                 Event::HardBreak => Some((SemanticLevel::HardBreak, range)),
                 Event::Rule => Some((SemanticLevel::Rule, range)),
@@ -246,13 +268,13 @@ impl SemanticSplit for Markdown {
             SemanticLevel::Sentence => text
                 .split_sentence_bound_indices()
                 .map(move |(i, str)| (offset + i, str)),
-            SemanticLevel::InlineElement
+            SemanticLevel::InlineElement(_)
             | SemanticLevel::SoftBreak
             | SemanticLevel::HardBreak
             | SemanticLevel::Rule => split_str_by_separator(
                 text,
                 self.ranges_after_offset(offset, semantic_level)
-                    .map(move |(_, sep)| sep.start - offset..sep.end - offset),
+                    .map(move |(l, sep)| (*l, sep.start - offset..sep.end - offset)),
             )
             .map(move |(i, str)| (offset + i, str)),
         }
@@ -456,12 +478,38 @@ mod tests {
 
         assert_eq!(
             vec![
-                &(SemanticLevel::InlineElement, 2..5),
-                &(SemanticLevel::InlineElement, 24..27)
+                &(
+                    SemanticLevel::InlineElement(SemanticSplitPosition::Next),
+                    2..5
+                ),
+                &(
+                    SemanticLevel::InlineElement(SemanticSplitPosition::Next),
+                    24..27
+                )
             ],
             markdown.ranges().collect::<Vec<_>>()
         );
-        assert_eq!(SemanticLevel::InlineElement, markdown.max_level());
+        assert_eq!(
+            SemanticLevel::InlineElement(SemanticSplitPosition::Next),
+            markdown.max_level()
+        );
+    }
+
+    /// A footnote reference is detected as an inline element that attaches to
+    /// the preceding text, and it is the highest level present in the input.
+    #[test]
+    fn test_footnote_reference() {
+        let markdown = Markdown::new("Footnote[^1]");
+        // `[^1]` occupies bytes 8..12 of the input.
+        let footnote = SemanticLevel::InlineElement(SemanticSplitPosition::Prev);
+
+        let found = markdown.ranges().collect::<Vec<_>>();
+        assert_eq!(vec![&(footnote, 8..12)], found);
+
+        let highest = markdown.max_level();
+        assert_eq!(footnote, highest);
     }
 
     #[test]