Preserve markdown indentation (#109)
If a chunk includes newlines, preserve any indentation inside the chunk even when trimming
benbrandt authored Mar 8, 2024
1 parent 9876af1 commit 28e19c0
Showing 29 changed files with 3,809 additions and 3,356 deletions.
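A minimal sketch of the effect (mirroring the new test in tests/markdown.rs below; assumes the `markdown` feature and the `MarkdownSplitter` builder API used in the diff):

use text_splitter::MarkdownSplitter;

fn main() {
    // With trimming enabled, leading whitespace is still removed from a chunk,
    // but when the trimmed chunk spans multiple lines its block-level
    // indentation is now preserved.
    let splitter = MarkdownSplitter::default().with_trim_chunks(true);
    let text = "* Really long list item that is too big to fit\n\n * Some Indented Text\n\n * More Indented Text\n\n";
    let chunks = splitter.chunks(text, 48).collect::<Vec<_>>();

    assert_eq!(
        vec![
            "* Really long list item that is too big to fit",
            " * Some Indented Text\n\n * More Indented Text",
        ],
        chunks
    );
}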
src/lib.rs: 29 additions & 10 deletions
@@ -197,6 +197,7 @@ impl ChunkCapacity for RangeToInclusive<usize> {
 }
 
 /// How a particular semantic level relates to surrounding text elements.
+#[allow(dead_code)]
 #[derive(Copy, Clone, Debug, Eq, Ord, PartialEq, PartialOrd)]
 enum SemanticSplitPosition {
     /// The semantic level should be included in the previous chunk.
@@ -210,6 +211,11 @@ enum SemanticSplitPosition {
 /// Information required by generic Semantic Levels
 trait Level: fmt::Debug {
     fn split_position(&self) -> SemanticSplitPosition;
+
+    /// Whether or not when splitting ranges, whitespace should be included as previous.
+    fn treat_whitespace_as_previous(&self) -> bool {
+        false
+    }
 }
 
 /// Implementation that dictates the semantic split points available.
@@ -268,6 +274,18 @@ trait SemanticSplit {
         text: &'text str,
         semantic_level: Self::Level,
     ) -> impl Iterator<Item = (usize, &'text str)> + 'splitter;
+
+    /// Trim the str and adjust the offset if necessary.
+    /// This is the default behavior, but custom semantic levels may need different behavior.
+    fn trim_chunk<'splitter, 'text: 'splitter>(
+        &'splitter self,
+        offset: usize,
+        chunk: &'text str,
+    ) -> (usize, &'text str) {
+        // Figure out how many bytes we lose trimming the beginning
+        let diff = chunk.len() - chunk.trim_start().len();
+        (offset + diff, chunk.trim())
+    }
 }
 
 /// Returns chunks of text with their byte offsets as an iterator.
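For reference, a standalone sketch (not part of the diff) of the offset arithmetic in the default trim: the returned byte offset advances by however many bytes were trimmed from the front of the chunk.

fn main() {
    let offset = 10;
    let chunk = "  hello world ";

    // Bytes lost trimming the beginning: the two leading spaces.
    let diff = chunk.len() - chunk.trim_start().len();
    assert_eq!(diff, 2);

    // The chunk's byte offset moves forward by the same amount.
    assert_eq!((offset + diff, chunk.trim()), (12, "hello world"));
}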
@@ -314,9 +332,7 @@ where
     /// If trim chunks is on, trim the str and adjust the offset
     fn trim_chunk(&self, offset: usize, chunk: &'text str) -> (usize, &'text str) {
         if self.trim_chunks {
-            // Figure out how many bytes we lose trimming the beginning
-            let diff = chunk.len() - chunk.trim_start().len();
-            (offset + diff, chunk.trim())
+            self.semantic_split.trim_chunk(offset, chunk)
         } else {
             (offset, chunk)
         }
@@ -429,13 +445,7 @@ where
         let chunk = self.text.get(start..self.cursor)?;
 
         // Trim whitespace if user requested it
-        Some(if self.trim_chunks {
-            // Figure out how many bytes we lose trimming the beginning
-            let offset = chunk.len() - chunk.trim_start().len();
-            (start + offset, chunk.trim())
-        } else {
-            (start, chunk)
-        })
+        Some(self.trim_chunk(start, chunk))
     }
 
     /// Find the ideal next sections, breaking it up until we find the largest chunk.
@@ -551,6 +561,15 @@ fn split_str_by_separator<L: Level>(
             let prev_section = text
                 .get(cursor..range.start)
                 .expect("invalid character sequence");
+            if prev_section.trim().is_empty()
+                && level.treat_whitespace_as_previous()
+            {
+                let section = text
+                    .get(cursor..range.end)
+                    .expect("invalid character sequence");
+                cursor = range.end;
+                return Some(Either::Left(once((offset, section))));
+            }
             let separator = text
                 .get(range.start..range.end)
                 .expect("invalid character sequence");
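The doc examples in src/unstable_markdown.rs below show the other visible effect of this change: a whitespace-only gap between block-level elements now travels with the following chunk instead of becoming a chunk of its own. A minimal sketch, assuming a default (non-trimming) `MarkdownSplitter` as in those doc examples:

use text_splitter::MarkdownSplitter;

fn main() {
    let splitter = MarkdownSplitter::default();
    let text = "Some text\n\nfrom a\ndocument";

    // The blank line after "Some text\n" is attached to the next chunk
    // ("\nfrom a\n") rather than emitted as a standalone "\n" chunk.
    let chunks = splitter.chunk_indices(text, 10).collect::<Vec<_>>();
    assert_eq!(
        vec![(0, "Some text\n"), (10, "\nfrom a\n"), (18, "document")],
        chunks
    );
}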
src/unstable_markdown.rs: 37 additions & 2 deletions
@@ -105,7 +105,7 @@ where
     /// let text = "Some text\n\nfrom a\ndocument";
     /// let chunks = splitter.chunks(text, 10).collect::<Vec<_>>();
     ///
-    /// assert_eq!(vec!["Some text\n", "\n", "from a\n", "document"], chunks);
+    /// assert_eq!(vec!["Some text\n", "\nfrom a\n", "document"], chunks);
     /// ```
     pub fn chunks<'splitter, 'text: 'splitter>(
         &'splitter self,
@@ -127,7 +127,7 @@ where
     /// let text = "Some text\n\nfrom a\ndocument";
     /// let chunks = splitter.chunk_indices(text, 10).collect::<Vec<_>>();
     ///
-    /// assert_eq!(vec![(0, "Some text\n"), (10, "\n"), (11, "from a\n"), (18, "document")], chunks);
+    /// assert_eq!(vec![(0, "Some text\n"), (10, "\nfrom a\n"), (18, "document")], chunks);
     pub fn chunk_indices<'splitter, 'text: 'splitter>(
         &'splitter self,
         text: &'text str,
@@ -215,6 +215,24 @@ impl Level for SemanticLevel {
             SemanticLevel::Heading(_) => SemanticSplitPosition::Next,
         }
     }
+
+    fn treat_whitespace_as_previous(&self) -> bool {
+        match self {
+            SemanticLevel::Char
+            | SemanticLevel::GraphemeCluster
+            | SemanticLevel::Word
+            | SemanticLevel::Sentence
+            | SemanticLevel::SoftBreak
+            | SemanticLevel::Text
+            | SemanticLevel::InlineElement(_)
+            | SemanticLevel::Rule
+            | SemanticLevel::Heading(_)
+            | SemanticLevel::Metadata => false,
+            SemanticLevel::Block
+            | SemanticLevel::ContainerBlock(_)
+            | SemanticLevel::MetaContainer => true,
+        }
+    }
 }
 
 /// Captures information about markdown structure for a given text, and their
@@ -225,6 +243,8 @@ struct Markdown {
     ranges: Vec<(SemanticLevel, Range<usize>)>,
 }
 
+const NEWLINES: [char; 2] = ['\n', '\r'];
+
 impl SemanticSplit for Markdown {
     type Level = SemanticLevel;
 
@@ -337,6 +357,21 @@ impl SemanticSplit for Markdown {
             .map(move |(i, str)| (offset + i, str)),
         }
     }
+
+    fn trim_chunk<'splitter, 'text: 'splitter>(
+        &'splitter self,
+        offset: usize,
+        chunk: &'text str,
+    ) -> (usize, &'text str) {
+        // Preserve indentation if we have newlines inside the element
+        if chunk.trim().contains(NEWLINES) {
+            let diff = chunk.len() - chunk.trim_start_matches(NEWLINES).len();
+            (offset + diff, chunk.trim_start_matches(NEWLINES).trim_end())
+        } else {
+            let diff = chunk.len() - chunk.trim_start().len();
+            (offset + diff, chunk.trim())
+        }
+    }
 }
 
 #[cfg(test)]
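To make the new trimming rule concrete, a standalone sketch (not part of the diff): plain `str::trim` removes the indentation in front of the first line, while the markdown-aware trim only strips leading newlines and trailing whitespace when the trimmed chunk spans multiple lines.

fn main() {
    const NEWLINES: [char; 2] = ['\n', '\r'];

    // A chunk that begins with a newline followed by an indented list item.
    let chunk = "\n * Some Indented Text\n\n * More Indented Text\n";

    // Generic trimming also strips the indentation of the first item.
    assert_eq!(
        chunk.trim(),
        "* Some Indented Text\n\n * More Indented Text"
    );

    // The markdown-specific trim removes only the leading newlines and the
    // trailing whitespace, so the indentation of the first item survives.
    assert_eq!(
        chunk.trim_start_matches(NEWLINES).trim_end(),
        " * Some Indented Text\n\n * More Indented Text"
    );
}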
tests/markdown.rs: 38 additions & 4 deletions
@@ -47,10 +47,7 @@ fn fallsback_to_normal_text_split_if_no_markdown_content() {
     let chunk_size = 10;
     let chunks = splitter.chunks(text, chunk_size).collect::<Vec<_>>();
 
-    assert_eq!(
-        ["Some text\n", "\n", "from a\n", "document"].to_vec(),
-        chunks
-    );
+    assert_eq!(["Some text\n", "\nfrom a\n", "document"].to_vec(), chunks);
 }
 
 #[cfg(feature = "markdown")]
@@ -110,3 +107,40 @@ fn subheadings_grouped_with_top_header() {
         chunks
     );
 }
+
+#[cfg(feature = "markdown")]
+#[test]
+fn trimming_doesnt_trim_block_level_indentation_if_multiple_items() {
+    let splitter = MarkdownSplitter::default().with_trim_chunks(true);
+    let text = "* Really long list item that is too big to fit\n\n * Some Indented Text\n\n * More Indented Text\n\n";
+    let chunk_size = 48;
+    let chunks = splitter.chunks(text, chunk_size).collect::<Vec<_>>();
+
+    assert_eq!(
+        [
+            "* Really long list item that is too big to fit",
+            " * Some Indented Text\n\n * More Indented Text"
+        ]
+        .to_vec(),
+        chunks
+    );
+}
+
+#[cfg(feature = "markdown")]
+#[test]
+fn trimming_does_trim_block_level_indentation_if_only_one_item() {
+    let splitter = MarkdownSplitter::default().with_trim_chunks(true);
+    let text = "1. Really long list item\n\n 1. Some Indented Text\n\n 2. More Indented Text\n\n";
+    let chunk_size = 30;
+    let chunks = splitter.chunks(text, chunk_size).collect::<Vec<_>>();
+
+    assert_eq!(
+        [
+            "1. Really long list item",
+            "1. Some Indented Text",
+            "2. More Indented Text"
+        ]
+        .to_vec(),
+        chunks
+    );
+}