Fix chunk regression with tokenizers (#81)
* Fix chunk regression with tokenizers

Brings back the old behavior from before the binary-search change: a chunk can keep consuming text as long as the larger chunk still measures the same chunk size (which can happen with tokenizers); see the sketch below.

* Fix for skipping mid

* Try again

* Hopefully faster

* Tokenize if we haven't yet

* Minor opt

* cleanup
benbrandt authored Jan 14, 2024
1 parent 11fe3df commit d3acb5e
Showing 18 changed files with 17,735 additions and 19,411 deletions.
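
The restored behavior is easiest to see in isolation. Below is a minimal, hypothetical sketch of the idea (the names `extend_while_size_unchanged`, `section_ends`, and `size_of` are illustrative, not the text-splitter crate's actual API): after the binary search picks the best candidate end, keep extending the chunk through the following sections as long as the measured size does not grow.

```rust
// Minimal sketch of the restored behavior; names and signature are
// illustrative, not the text-splitter crate's actual API.
//
// After a binary search has picked the best candidate end (`best_index`),
// keep extending the chunk through later sections as long as the measured
// size (e.g. a token count) does not grow past the best size found.
fn extend_while_size_unchanged(
    text: &str,
    section_ends: &[usize],          // byte offset where each candidate section ends
    best_index: usize,               // index chosen by the binary search
    size_of: impl Fn(&str) -> usize, // chunk size measure, e.g. number of tokens
) -> usize {
    let mut end = section_ends[best_index];
    let best_size = size_of(&text[..end]);
    for &candidate_end in &section_ends[best_index + 1..] {
        if size_of(&text[..candidate_end]) <= best_size {
            // Same measured size, so take the longer chunk.
            end = candidate_end;
        } else {
            // The size grew, so stop extending.
            break;
        }
    }
    end
}
```

With a tokenizer as the size function, trailing bytes such as a newline can add no tokens, so "123" and "123\n" measure the same. That is why the updated Python test below now expects `["123\n", "123"]` for a chunk capacity of 1.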
6 changes: 5 additions & 1 deletion CHANGELOG.md
@@ -1,6 +1,10 @@
 # Changelog
 
-## v0.5.1
+## v0.6.0
+
+### Breaking Changes
+
+- Chunk behavior should now be the same as prior to v0.5.0. Once binary search finds the optimal chunk, we now check the next few sections as long as the chunk size doesn't change. This should result in the same behavior as before, but with the performance improvements of binary search.
 
 ### What's New
 
2 changes: 1 addition & 1 deletion Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "text-splitter"
-version = "0.5.1"
+version = "0.6.0"
 authors = ["Ben Brandt <[email protected]>"]
 edition = "2021"
 description = "Split text into semantic chunks, up to a desired chunk size. Supports calculating length by characters and tokens (when used with large language models)."
4 changes: 2 additions & 2 deletions bindings/python/Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion bindings/python/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "semantic-text-splitter"
-version = "0.5.1"
+version = "0.6.0"
 authors = ["Ben Brandt <[email protected]>"]
 edition = "2021"
 description = "Split text into semantic chunks, up to a desired chunk size. Supports calculating length by characters and tokens (when used with large language models)."
4 changes: 2 additions & 2 deletions bindings/python/tests/test_integration.py
@@ -33,14 +33,14 @@ def test_hugging_face():
     tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
     splitter = HuggingFaceTextSplitter(tokenizer, trim_chunks=False)
     text = "123\n123"
-    assert splitter.chunks(text, 1) == ["123", "\n123"]
+    assert splitter.chunks(text, 1) == ["123\n", "123"]
 
 
 def test_hugging_face_range():
     tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
     splitter = HuggingFaceTextSplitter(tokenizer, trim_chunks=False)
     text = "123\n123"
-    assert splitter.chunks(text=text, chunk_capacity=(1, 2)) == ["123", "\n123"]
+    assert splitter.chunks(text=text, chunk_capacity=(1, 2)) == ["123\n", "123"]
 
 
 def test_hugging_face_trim():
48 changes: 44 additions & 4 deletions src/lib.rs
@@ -155,7 +155,7 @@ pub use characters::Characters;
 /// Result returned from a `ChunkSizer`. Includes the size of the chunk, in units
 /// determined by the sizer, as well as the max byte offset of the text that
 /// would fit within the given `ChunkCapacity`.
-#[derive(Debug, PartialEq)]
+#[derive(Copy, Clone, Debug, PartialEq)]
 pub struct ChunkSize {
     /// Whether or not the entire chunk fits within the `ChunkCapacity`
     fits: Ordering,
@@ -623,37 +623,46 @@
     fn next_chunk(&mut self) -> Option<(usize, &'text str)> {
         let start = self.cursor;
         let mut end = self.cursor;
-        let mut equals_found = 0;
+        let mut equals_found = false;
 
         let sections = self.next_sections()?.collect::<Vec<_>>();
+        let mut sizes = sections
+            .iter()
+            .map(|_| None)
+            .collect::<Vec<Option<ChunkSize>>>();
         let mut low = 0;
         let mut high = sections.len().saturating_sub(1);
+        let mut successful_index = None;
 
         while low <= high {
             let mid = low + (high - low) / 2;
             let (offset, str) = sections[mid];
             let text_end = offset + str.len();
             let chunk = self.text.get(start..text_end)?;
             let chunk_size = self.check_capacity(start, chunk);
+            sizes[mid] = Some(chunk_size);
 
             match chunk_size.fits {
                 Ordering::Less => {
                     // We got further than the last one, so update end
                     if text_end > end {
                         end = text_end;
+                        successful_index = Some(mid);
                     }
                 }
                 Ordering::Equal => {
                     // If we found a smaller equals use it. Or if this is the first equals we found
-                    if text_end < end || equals_found == 0 {
+                    if text_end < end || !equals_found {
                         end = text_end;
+                        successful_index = Some(mid);
                     }
-                    equals_found += 1;
+                    equals_found = true;
                 }
                 Ordering::Greater => {
                     // If we're too big on our smallest run, we must return at least one section
                     if mid == 0 && start == end {
                         end = text_end;
+                        successful_index = Some(mid);
                     }
                 }
             };
@@ -669,6 +678,37 @@
             }
         }
 
+        // Sometimes with tokenization, we can get a bigger chunk for the same amount of tokens.
+        if let Some((successful_index, chunk_size)) =
+            successful_index.and_then(|successful_index| {
+                Some((successful_index, sizes.get(successful_index)?.as_ref()?))
+            })
+        {
+            for (size, (offset, str)) in sizes.iter().zip(sections).skip(successful_index) {
+                let text_end = offset + str.len();
+                match size {
+                    Some(size) if size.size <= chunk_size.size => {
+                        if text_end > end {
+                            end = text_end;
+                        }
+                    }
+                    // We didn't tokenize this section yet
+                    None => {
+                        let chunk = self.text.get(start..text_end)?;
+                        let size = self.check_capacity(start, chunk);
+                        if size.size <= chunk_size.size {
+                            if text_end > end {
+                                end = text_end;
+                            }
+                        } else {
+                            break;
+                        }
+                    }
+                    _ => break,
+                }
+            }
+        }
+
         self.cursor = end;
 
         let chunk = self.text.get(start..self.cursor)?;
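
The `sizes` vector above lets the forward scan reuse sizes already measured during the binary search ("Tokenize if we haven't yet" in the commit list) instead of tokenizing every section again. A minimal sketch of that caching pattern, using hypothetical names (`cached_size`, `measure`) rather than the crate's actual code:

```rust
// Hypothetical sketch of the caching pattern; `cached_size` and `measure`
// are illustrative names, not the crate's actual code.
fn cached_size(
    sizes: &mut [Option<usize>],      // one slot per candidate section
    index: usize,
    measure: impl Fn(usize) -> usize, // expensive measurement, e.g. tokenizing
) -> usize {
    if let Some(size) = sizes[index] {
        // Already measured during the binary search: reuse it.
        size
    } else {
        // First time this section is considered: measure once and remember it.
        let size = measure(index);
        sizes[index] = Some(size);
        size
    }
}
```

Tokenizing is the expensive step, so reusing sizes already computed during the binary search keeps the extra forward check cheap.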