UnDRY the split by separator code

I should have reflected more when the compiler told me there was dead code if the Markdown feature wasn't enabled around the `SemanticSplitPosition`. I was trying to overfit too much structure in order to reuse one loop. This was already vastly different between text and markdown, and made the code harder to reason about. Just split the two implementations, and also got rid of the `Level` trait which I also didn't feel great about because the methods there were only ever really needed for Markdown anyway.
benbrandt · Mar 23, 2024 · f898ebc · f898ebc
1 parent ecf0c78
commit f898ebc
Show file tree

Hide file tree

Showing 3 changed files with 151 additions and 126 deletions.
diff --git a/src/lib.rs b/src/lib.rs
@@ -3,12 +3,9 @@
 
 use std::{
     cmp::Ordering,
-    fmt,
-    iter::once,
     ops::{Range, RangeFrom, RangeFull, RangeInclusive, RangeTo, RangeToInclusive},
 };
 
-use either::Either;
 use itertools::Itertools;
 
 mod characters;
@@ -198,36 +195,14 @@ impl ChunkCapacity for RangeToInclusive<usize> {
     }
 }
 
-/// How a particular semantic level relates to surrounding text elements.
-#[allow(dead_code)]
-#[derive(Copy, Clone, Debug, Eq, Ord, PartialEq, PartialOrd)]
-enum SemanticSplitPosition {
-    /// The semantic level should be included in the previous chunk.
-    Prev,
-    /// The semantic level should be treated as its own chunk.
-    Own,
-    /// The semantic level should be included in the next chunk.
-    Next,
-}
-
-/// Information required by generic Semantic Levels
-trait Level: fmt::Debug {
-    fn split_position(&self) -> SemanticSplitPosition;
-
-    /// Whether or not when splitting ranges, whitespace should be included as previous.
-    fn treat_whitespace_as_previous(&self) -> bool {
-        false
-    }
-}
-
 /// Implementation that dictates the semantic split points available.
 /// For plain text, this goes from characters, to grapheme clusters, to words,
 /// to sentences, to linebreaks.
 /// For something like Markdown, this would also include things like headers,
 /// lists, and code blocks.
 trait SemanticSplit {
     /// Internal type used to represent the level of semantic splitting.
-    type Level: Copy + Level + Ord + PartialOrd + 'static;
+    type Level: Copy + Ord + PartialOrd + 'static;
 
     /// Levels that are always considered in splitting text, because they are always present.
     const PERSISTENT_LEVELS: &'static [Self::Level];
@@ -524,82 +499,6 @@ where
     }
 }
 
-/// Given a list of separator ranges, construct the sections of the text
-fn split_str_by_separator<L: Level>(
-    text: &str,
-    separator_ranges: impl Iterator<Item = (L, Range<usize>)>,
-) -> impl Iterator<Item = (usize, &str)> {
-    let mut cursor = 0;
-    let mut final_match = false;
-    separator_ranges
-        .batching(move |it| {
-            loop {
-                match it.next() {
-                    // If we've hit the end, actually return None
-                    None if final_match => return None,
-                    // First time we hit None, return the final section of the text
-                    None => {
-                        final_match = true;
-                        return text.get(cursor..).map(|t| Either::Left(once((cursor, t))));
-                    }
-                    // Return text preceding match + the match
-                    Some((level, range)) => {
-                        let offset = cursor;
-                        match level.split_position() {
-                            SemanticSplitPosition::Prev => {
-                                if range.end < cursor {
-                                    continue;
-                                }
-                                let section = text
-                                    .get(cursor..range.end)
-                                    .expect("invalid character sequence");
-                                cursor = range.end;
-                                return Some(Either::Left(once((offset, section))));
-                            }
-                            SemanticSplitPosition::Own => {
-                                if range.start < cursor {
-                                    continue;
-                                }
-                                let prev_section = text
-                                    .get(cursor..range.start)
-                                    .expect("invalid character sequence");
-                                if prev_section.trim().is_empty()
-                                    && level.treat_whitespace_as_previous()
-                                {
-                                    let section = text
-                                        .get(cursor..range.end)
-                                        .expect("invalid character sequence");
-                                    cursor = range.end;
-                                    return Some(Either::Left(once((offset, section))));
-                                }
-                                let separator = text
-                                    .get(range.start..range.end)
-                                    .expect("invalid character sequence");
-                                cursor = range.end;
-                                return Some(Either::Right(
-                                    [(offset, prev_section), (range.start, separator)].into_iter(),
-                                ));
-                            }
-                            SemanticSplitPosition::Next => {
-                                if range.start < cursor {
-                                    continue;
-                                }
-                                let prev_section = text
-                                    .get(cursor..range.start)
-                                    .expect("invalid character sequence");
-                                // Separator will be part of the next chunk
-                                cursor = range.start;
-                                return Some(Either::Left(once((offset, prev_section))));
-                            }
-                        }
-                    }
-                }
-            }
-        })
-        .flatten()
-        .filter(|(_, s)| !s.is_empty())
-}
-
 #[cfg(test)]
 mod tests {
     use super::*;

diff --git a/src/markdown.rs b/src/markdown.rs
@@ -4,16 +4,15 @@ Semantic splitting of Markdown documents. Tries to use as many semantic units fr
 as possible, according to the Common Mark specification.
 */
 
-use std::ops::Range;
+use std::{iter::once, ops::Range};
 
 use auto_enums::auto_enum;
+use either::Either;
+use itertools::Itertools;
 use pulldown_cmark::{Event, Options, Parser, Tag};
 use unicode_segmentation::UnicodeSegmentation;
 
-use crate::{
-    split_str_by_separator, Characters, ChunkCapacity, ChunkSizer, Level, SemanticSplit,
-    SemanticSplitPosition, TextChunks,
-};
+use crate::{Characters, ChunkCapacity, ChunkSizer, SemanticSplit, TextChunks};
 
 /// Markdown splitter. Recursively splits chunks into the largest
 /// semantic units that fit within the chunk size. Also will
@@ -169,6 +168,17 @@ impl From<pulldown_cmark::HeadingLevel> for HeadingLevel {
     }
 }
 
+/// How a particular semantic level relates to surrounding text elements.
+#[derive(Copy, Clone, Debug, Eq, Ord, PartialEq, PartialOrd)]
+enum SemanticSplitPosition {
+    /// The semantic level should be included in the previous chunk.
+    Prev,
+    /// The semantic level should be treated as its own chunk.
+    Own,
+    /// The semantic level should be included in the next chunk.
+    Next,
+}
+
 /// Different semantic levels that text can be split by.
 /// Each level provides a method of splitting text into chunks of a given level
 /// as well as a fallback in case a given fallback is too large.
@@ -204,8 +214,8 @@ enum SemanticLevel {
     Metadata,
 }
 
-impl Level for SemanticLevel {
-    fn split_position(&self) -> SemanticSplitPosition {
+impl SemanticLevel {
+    fn split_position(self) -> SemanticSplitPosition {
         match self {
             SemanticLevel::Char
             | SemanticLevel::GraphemeCluster
@@ -217,13 +227,13 @@ impl Level for SemanticLevel {
             | SemanticLevel::MetaContainer
             | SemanticLevel::Rule
             | SemanticLevel::Metadata => SemanticSplitPosition::Own,
-            SemanticLevel::InlineElement(p) | SemanticLevel::ContainerBlock(p) => *p,
+            SemanticLevel::InlineElement(p) | SemanticLevel::ContainerBlock(p) => p,
             // Attach it to the next text
             SemanticLevel::Heading(_) => SemanticSplitPosition::Next,
         }
     }
 
-    fn treat_whitespace_as_previous(&self) -> bool {
+    fn treat_whitespace_as_previous(self) -> bool {
         match self {
             SemanticLevel::Char
             | SemanticLevel::GraphemeCluster
@@ -250,6 +260,85 @@ struct Markdown {
     ranges: Vec<(SemanticLevel, Range<usize>)>,
 }
 
+impl Markdown {
+    /// Given a list of separator ranges, construct the sections of the text
+    fn split_str_by_separator(
+        text: &str,
+        separator_ranges: impl Iterator<Item = (SemanticLevel, Range<usize>)>,
+    ) -> impl Iterator<Item = (usize, &str)> {
+        let mut cursor = 0;
+        let mut final_match = false;
+        separator_ranges
+            .batching(move |it| {
+                loop {
+                    match it.next() {
+                        // If we've hit the end, actually return None
+                        None if final_match => return None,
+                        // First time we hit None, return the final section of the text
+                        None => {
+                            final_match = true;
+                            return text.get(cursor..).map(|t| Either::Left(once((cursor, t))));
+                        }
+                        // Return text preceding match + the match
+                        Some((level, range)) => {
+                            let offset = cursor;
+                            match level.split_position() {
+                                SemanticSplitPosition::Prev => {
+                                    if range.end < cursor {
+                                        continue;
+                                    }
+                                    let section = text
+                                        .get(cursor..range.end)
+                                        .expect("invalid character sequence");
+                                    cursor = range.end;
+                                    return Some(Either::Left(once((offset, section))));
+                                }
+                                SemanticSplitPosition::Own => {
+                                    if range.start < cursor {
+                                        continue;
+                                    }
+                                    let prev_section = text
+                                        .get(cursor..range.start)
+                                        .expect("invalid character sequence");
+                                    if prev_section.trim().is_empty()
+                                        && level.treat_whitespace_as_previous()
+                                    {
+                                        let section = text
+                                            .get(cursor..range.end)
+                                            .expect("invalid character sequence");
+                                        cursor = range.end;
+                                        return Some(Either::Left(once((offset, section))));
+                                    }
+                                    let separator = text
+                                        .get(range.start..range.end)
+                                        .expect("invalid character sequence");
+                                    cursor = range.end;
+                                    return Some(Either::Right(
+                                        [(offset, prev_section), (range.start, separator)]
+                                            .into_iter(),
+                                    ));
+                                }
+                                SemanticSplitPosition::Next => {
+                                    if range.start < cursor {
+                                        continue;
+                                    }
+                                    let prev_section = text
+                                        .get(cursor..range.start)
+                                        .expect("invalid character sequence");
+                                    // Separator will be part of the next chunk
+                                    cursor = range.start;
+                                    return Some(Either::Left(once((offset, prev_section))));
+                                }
+                            }
+                        }
+                    }
+                }
+            })
+            .flatten()
+            .filter(|(_, s)| !s.is_empty())
+    }
+}
+
 const NEWLINES: [char; 2] = ['\n', '\r'];
 
 impl SemanticSplit for Markdown {
@@ -356,7 +445,7 @@ impl SemanticSplit for Markdown {
             | SemanticLevel::MetaContainer
             | SemanticLevel::Heading(_)
             | SemanticLevel::Rule
-            | SemanticLevel::Metadata => split_str_by_separator(
+            | SemanticLevel::Metadata => Self::split_str_by_separator(
                 text,
                 self.ranges_after_offset(offset, semantic_level)
                     .map(move |(l, sep)| (*l, sep.start - offset..sep.end - offset)),

diff --git a/src/text.rs b/src/text.rs
@@ -3,17 +3,16 @@
 Semantic splitting of text documents.
 */
 
-use std::ops::Range;
+use std::{iter::once, ops::Range};
 
 use auto_enums::auto_enum;
+use either::Either;
+use itertools::Itertools;
 use once_cell::sync::Lazy;
 use regex::Regex;
 use unicode_segmentation::UnicodeSegmentation;
 
-use crate::{
-    split_str_by_separator, Characters, ChunkCapacity, ChunkSizer, Level, SemanticSplit,
-    SemanticSplitPosition, TextChunks,
-};
+use crate::{Characters, ChunkCapacity, ChunkSizer, SemanticSplit, TextChunks};
 
 /// Default plain-text splitter. Recursively splits chunks into the largest
 /// semantic units that fit within the chunk size. Also will attempt to merge
@@ -164,13 +163,6 @@ enum SemanticLevel {
     LineBreak(usize),
 }
 
-impl Level for SemanticLevel {
-    /// All of these levels should be treated as their own chunk
-    fn split_position(&self) -> SemanticSplitPosition {
-        SemanticSplitPosition::Own
-    }
-}
-
 // Lazy so that we don't have to compile them more than once
 static LINEBREAKS: Lazy<Regex> = Lazy::new(|| Regex::new(r"(\r\n)+|\r+|\n+").unwrap());
 
@@ -182,6 +174,51 @@ struct LineBreaks {
     line_breaks: Vec<(SemanticLevel, Range<usize>)>,
 }
 
+impl LineBreaks {
+    /// Given a list of separator ranges, construct the sections of the text
+    fn split_str_by_separator(
+        text: &str,
+        separator_ranges: impl Iterator<Item = Range<usize>>,
+    ) -> impl Iterator<Item = (usize, &str)> {
+        let mut cursor = 0;
+        let mut final_match = false;
+        separator_ranges
+            .batching(move |it| {
+                loop {
+                    match it.next() {
+                        // If we've hit the end, actually return None
+                        None if final_match => return None,
+                        // First time we hit None, return the final section of the text
+                        None => {
+                            final_match = true;
+                            return text.get(cursor..).map(|t| Either::Left(once((cursor, t))));
+                        }
+                        // Return text preceding match + the match
+                        Some(range) => {
+                            if range.start < cursor {
+                                continue;
+                            }
+
+                            let offset = cursor;
+                            let prev_section = text
+                                .get(offset..range.start)
+                                .expect("invalid character sequence");
+                            let separator = text
+                                .get(range.start..range.end)
+                                .expect("invalid character sequence");
+                            cursor = range.end;
+                            return Some(Either::Right(
+                                [(offset, prev_section), (range.start, separator)].into_iter(),
+                            ));
+                        }
+                    }
+                }
+            })
+            .flatten()
+            .filter(|(_, s)| !s.is_empty())
+    }
+}
+
 impl SemanticSplit for LineBreaks {
     type Level = SemanticLevel;
 
@@ -245,10 +282,10 @@ impl SemanticSplit for LineBreaks {
             SemanticLevel::Sentence => text
                 .split_sentence_bound_indices()
                 .map(move |(i, str)| (offset + i, str)),
-            SemanticLevel::LineBreak(_) => split_str_by_separator(
+            SemanticLevel::LineBreak(_) => Self::split_str_by_separator(
                 text,
                 self.ranges_after_offset(offset, semantic_level)
-                    .map(move |(l, sep)| (*l, sep.start - offset..sep.end - offset)),
+                    .map(move |(_, sep)| sep.start - offset..sep.end - offset),
             )
             .map(move |(i, str)| (offset + i, str)),
         }