feat!: Consolidate Markdown split levels

In hindsight, these levels were a bit too granular, which led to some strange splitting behavior. More elements are consolidated into fewer levels to allow for much more natural split points. This will be a breaking change in the output of the Markdown splitter.
benbrandt · Apr 5, 2024 · 4eabfec · 4eabfec
1 parent 17bc95a
commit 4eabfec
Show file tree

Hide file tree

Showing 42 changed files with 1,553 additions and 2,196 deletions.
diff --git a/README.md b/README.md
@@ -161,12 +161,9 @@ Markdown is parsed according to the CommonMark spec, along with some optional fe
 4. [Unicode Sentence Boundaries](https://www.unicode.org/reports/tr29/#Sentence_Boundaries)
 5. Soft line breaks (single newline) which isn't necessarily a new element in Markdown.
 6. Inline elements such as: text nodes, emphasis, strong, strikethrough, link, image, table cells, inline code, footnote references, task list markers, and inline html.
-7. Block elements suce as: paragraphs, code blocks, and footnote definitions.
-8. Container blocks such as: table rows, block quotes, list items, and HTML blocks.
-9. Meta containers such as: lists and tables.
-10. Thematic breaks or horizontal rules.
-11. Headings by level
-12. Metadata at the beginning of the document
+7. Block elements such as: paragraphs, code blocks, footnote definitions, metadata. Also, a block quote or row/item within a table or list that can contain other "block" type elements, and a list or table that contains items.
+8. Thematic breaks or horizontal rules.
+9. Headings by level
 
 Splitting doesn't occur below the character level, otherwise you could get partial bytes of a char, which may not be a valid unicode str.
 

diff --git a/bindings/python/README.md b/bindings/python/README.md
@@ -122,12 +122,9 @@ Markdown is parsed according to the CommonMark spec, along with some optional fe
 4. [Unicode Sentence Boundaries](https://www.unicode.org/reports/tr29/#Sentence_Boundaries)
 5. Soft line breaks (single newline) which isn't necessarily a new element in Markdown.
 6. Inline elements such as: text nodes, emphasis, strong, strikethrough, link, image, table cells, inline code, footnote references, task list markers, and inline html.
-7. Block elements suce as: paragraphs, code blocks, and footnote definitions.
-8. Container blocks such as: table rows, block quotes, list items, and HTML blocks.
-9. Meta containers such as: lists and tables.
-10. Thematic breaks or horizontal rules.
-11. Headings by level
-12. Metadata at the beginning of the document
+7. Block elements such as: paragraphs, code blocks, footnote definitions, metadata. Also, a block quote or row/item within a table or list that can contain other "block" type elements, and a list or table that contains items.
+8. Thematic breaks or horizontal rules.
+9. Headings by level
 
 Splitting doesn't occur below the character level, otherwise you could get partial bytes of a char, which may not be a valid unicode str.
 

diff --git a/bindings/python/semantic_text_splitter.pyi b/bindings/python/semantic_text_splitter.pyi
@@ -415,12 +415,9 @@ class MarkdownSplitter:
         4. [Unicode Sentence Boundaries](https://www.unicode.org/reports/tr29/#Sentence_Boundaries)
         5. Soft line breaks (single newline) which isn't necessarily a new element in Markdown.
         6. Inline elements such as: text nodes, emphasis, strong, strikethrough, link, image, table cells, inline code, footnote references, task list markers, and inline html.
-        7. Block elements suce as: paragraphs, code blocks, and footnote definitions.
-        8. Container blocks such as: table rows, block quotes, list items, and HTML blocks.
-        9. Meta containers such as: lists and tables.
-        10. Thematic breaks or horizontal rules.
-        11. Headings by level
-        12. Metadata at the beginning of the document
+        7. Block elements such as: paragraphs, code blocks, footnote definitions, metadata. Also, a block quote or row/item within a table or list that can contain other "block" type elements, and a list or table that contains items.
+        8. Thematic breaks or horizontal rules.
+        9. Headings by level
 
         Markdown is parsed according to the Commonmark spec, along with some optional features such as GitHub Flavored Markdown.
 

diff --git a/bindings/python/src/lib.rs b/bindings/python/src/lib.rs
@@ -737,12 +737,9 @@ impl PyMarkdownSplitter {
     4. [Unicode Sentence Boundaries](https://www.unicode.org/reports/tr29/#Sentence_Boundaries)
     5. Soft line breaks (single newline) which isn't necessarily a new element in Markdown.
     6. Inline elements such as: text nodes, emphasis, strong, strikethrough, link, image, table cells, inline code, footnote references, task list markers, and inline html.
-    7. Block elements suce as: paragraphs, code blocks, and footnote definitions.
-    8. Container blocks such as: table rows, block quotes, list items, and HTML blocks.
-    9. Meta containers such as: lists and tables.
-    10. Thematic breaks or horizontal rules.
-    11. Headings by level
-    12. Metadata at the beginning of the document
+    7. Block elements such as: paragraphs, code blocks, footnote definitions, metadata. Also, a block quote or row/item within a table or list that can contain other "block" type elements, and a list or table that contains items.
+    8. Thematic breaks or horizontal rules.
+    9. Headings by level
 
     Markdown is parsed according to the Commonmark spec, along with some optional features such as GitHub Flavored Markdown.
 

diff --git a/src/lib.rs b/src/lib.rs
@@ -37,22 +37,45 @@ where
     fn ranges_after_offset(
         &self,
         offset: usize,
-    ) -> impl Iterator<Item = &(Level, Range<usize>)> + '_ {
+    ) -> impl Iterator<Item = (Level, Range<usize>)> + '_ {
         self.ranges
             .iter()
             .filter(move |(_, sep)| sep.start >= offset)
+            .map(|(l, r)| (*l, r.start..r.end))
     }
     /// Retrieve ranges for all sections of a given level after an offset
     fn level_ranges_after_offset(
         &self,
         offset: usize,
         level: Level,
-    ) -> impl Iterator<Item = &(Level, Range<usize>)> + '_ {
-        let first_item = self.ranges_after_offset(offset).find(|(l, _)| l == &level);
+    ) -> impl Iterator<Item = (Level, Range<usize>)> + '_ {
+        // Find the first item of this level. Allows us to skip larger items of a higher level that surround this one.
+        // Otherwise all lower levels would only return the first item of the higher level that wraps it.
+        let first_item = self
+            .ranges_after_offset(offset)
+            .position(|(l, _)| l == level)
+            .and_then(|i| {
+                self.ranges_after_offset(offset)
+                    .skip(i)
+                    .coalesce(|(a_level, a_range), (b_level, b_range)| {
+                        // If we are at the first item, if two neighboring elements have the same level and start, take the shorter one
+                        if a_level == b_level && a_range.start == b_range.start && i == 0 {
+                            Ok((b_level, b_range))
+                        } else {
+                            Err(((a_level, a_range), (b_level, b_range)))
+                        }
+                    })
+                    // Just take the first of these items
+                    .next()
+            });
+        // let first_item = self.ranges_after_offset(offset).find(|(l, _)| l == &level);
         self.ranges_after_offset(offset)
             .filter(move |(l, _)| l >= &level)
             .skip_while(move |(l, r)| {
-                first_item.is_some_and(|(_, fir)| l > &level && r.contains(&fir.start))
+                first_item.as_ref().is_some_and(|(_, fir)| {
+                    (l > &level && r.contains(&fir.start))
+                        || (l == &level && r.start == fir.start && r.end > fir.end)
+                })
             })
     }
 
@@ -63,10 +86,10 @@ where
 
         self.peristent_levels
             .iter()
+            .copied()
             .chain(existing_levels)
             .sorted()
             .dedup()
-            .copied()
     }
 
     /// Clear out ranges we have moved past so future iterations are faster