Fix chunk regression with tokenizers (#81)
* Fix chunk regression with tokenizers

Brings back the old behavior from before the binary-search change: a chunk can keep consuming text as long as the larger chunk still measures the same chunk size (which can happen with tokenizers); see the sketch below.

* Fix for skipping mid

* Try again

* Hopefully faster

* Tokenize if we haven't yet

* Minor opt

* cleanup
benbrandt authored Jan 14, 2024
1 parent 11fe3df commit d3acb5e
Showing 18 changed files with 17,735 additions and 19,411 deletions.
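
The restored behavior is easiest to see in isolation. Below is a minimal, hypothetical sketch of the idea (the names `extend_while_size_unchanged`, `section_ends`, and `size_of` are illustrative, not the text-splitter crate's actual API): after the binary search picks the best candidate end, keep extending the chunk through the following sections as long as the measured size does not grow.

```rust
// Minimal sketch of the restored behavior; names and signature are
// illustrative, not the text-splitter crate's actual API.
//
// After a binary search has picked the best candidate end (`best_index`),
// keep extending the chunk through later sections as long as the measured
// size (e.g. a token count) does not grow past the best size found.
fn extend_while_size_unchanged(
    text: &str,
    section_ends: &[usize],          // byte offset where each candidate section ends
    best_index: usize,               // index chosen by the binary search
    size_of: impl Fn(&str) -> usize, // chunk size measure, e.g. number of tokens
) -> usize {
    let mut end = section_ends[best_index];
    let best_size = size_of(&text[..end]);
    for &candidate_end in &section_ends[best_index + 1..] {
        if size_of(&text[..candidate_end]) <= best_size {
            // Same measured size, so take the longer chunk.
            end = candidate_end;
        } else {
            // The size grew, so stop extending.
            break;
        }
    }
    end
}
```

With a tokenizer as the size function, trailing bytes such as a newline can add no tokens, so "123" and "123\n" measure the same. That is why the updated Python test below now expects `["123\n", "123"]` for a chunk capacity of 1.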
6 changes: 5 additions & 1 deletion CHANGELOG.md
@@ -1,6 +1,10 @@
 # Changelog
 
-## v0.5.1
+## v0.6.0
+
+### Breaking Changes
+
+- Chunk behavior should now be the same as prior to v0.5.0. Once binary search finds the optimal chunk, we now check the next few sections as long as the chunk size doesn't change. This should result in the same behavior as before, but with the performance improvements of binary search.
 
 ### What's New
 
2 changes: 1 addition & 1 deletion Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "text-splitter"
-version = "0.5.1"
+version = "0.6.0"
 authors = ["Ben Brandt <[email protected]>"]
 edition = "2021"
 description = "Split text into semantic chunks, up to a desired chunk size. Supports calculating length by characters and tokens (when used with large language models)."
4 changes: 2 additions & 2 deletions bindings/python/Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion bindings/python/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "semantic-text-splitter"
-version = "0.5.1"
+version = "0.6.0"
 authors = ["Ben Brandt <[email protected]>"]
 edition = "2021"
 description = "Split text into semantic chunks, up to a desired chunk size. Supports calculating length by characters and tokens (when used with large language models)."
4 changes: 2 additions & 2 deletions bindings/python/tests/test_integration.py
@@ -33,14 +33,14 @@ def test_hugging_face():
     tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
     splitter = HuggingFaceTextSplitter(tokenizer, trim_chunks=False)
     text = "123\n123"
-    assert splitter.chunks(text, 1) == ["123", "\n123"]
+    assert splitter.chunks(text, 1) == ["123\n", "123"]
 
 
 def test_hugging_face_range():
     tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
     splitter = HuggingFaceTextSplitter(tokenizer, trim_chunks=False)
     text = "123\n123"
-    assert splitter.chunks(text=text, chunk_capacity=(1, 2)) == ["123", "\n123"]
+    assert splitter.chunks(text=text, chunk_capacity=(1, 2)) == ["123\n", "123"]
 
 
 def test_hugging_face_trim():
48 changes: 44 additions & 4 deletions src/lib.rs
@@ -155,7 +155,7 @@ pub use characters::Characters;
 /// Result returned from a `ChunkSizer`. Includes the size of the chunk, in units
 /// determined by the sizer, as well as the max byte offset of the text that
 /// would fit within the given `ChunkCapacity`.
-#[derive(Debug, PartialEq)]
+#[derive(Copy, Clone, Debug, PartialEq)]
 pub struct ChunkSize {
     /// Whether or not the entire chunk fits within the `ChunkCapacity`
     fits: Ordering,
@@ -623,37 +623,46 @@
     fn next_chunk(&mut self) -> Option<(usize, &'text str)> {
         let start = self.cursor;
         let mut end = self.cursor;
-        let mut equals_found = 0;
+        let mut equals_found = false;
 
         let sections = self.next_sections()?.collect::<Vec<_>>();
+        let mut sizes = sections
+            .iter()
+            .map(|_| None)
+            .collect::<Vec<Option<ChunkSize>>>();
         let mut low = 0;
         let mut high = sections.len().saturating_sub(1);
+        let mut successful_index = None;
 
         while low <= high {
             let mid = low + (high - low) / 2;
             let (offset, str) = sections[mid];
             let text_end = offset + str.len();
             let chunk = self.text.get(start..text_end)?;
             let chunk_size = self.check_capacity(start, chunk);
+            sizes[mid] = Some(chunk_size);
 
             match chunk_size.fits {
                 Ordering::Less => {
                     // We got further than the last one, so update end
                     if text_end > end {
                         end = text_end;
+                        successful_index = Some(mid);
                     }
                 }
                 Ordering::Equal => {
                     // If we found a smaller equals use it. Or if this is the first equals we found
-                    if text_end < end || equals_found == 0 {
+                    if text_end < end || !equals_found {
                         end = text_end;
+                        successful_index = Some(mid);
                     }
-                    equals_found += 1;
+                    equals_found = true;
                 }
                 Ordering::Greater => {
                     // If we're too big on our smallest run, we must return at least one section
                     if mid == 0 && start == end {
                         end = text_end;
+                        successful_index = Some(mid);
                     }
                 }
             };
@@ -669,6 +678,37 @@
             }
         }
 
+        // Sometimes with tokenization, we can get a bigger chunk for the same amount of tokens.
+        if let Some((successful_index, chunk_size)) =
+            successful_index.and_then(|successful_index| {
+                Some((successful_index, sizes.get(successful_index)?.as_ref()?))
+            })
+        {
+            for (size, (offset, str)) in sizes.iter().zip(sections).skip(successful_index) {
+                let text_end = offset + str.len();
+                match size {
+                    Some(size) if size.size <= chunk_size.size => {
+                        if text_end > end {
+                            end = text_end;
+                        }
+                    }
+                    // We didn't tokenize this section yet
+                    None => {
+                        let chunk = self.text.get(start..text_end)?;
+                        let size = self.check_capacity(start, chunk);
+                        if size.size <= chunk_size.size {
+                            if text_end > end {
+                                end = text_end;
+                            }
+                        } else {
+                            break;
+                        }
+                    }
+                    _ => break,
+                }
+            }
+        }
+
         self.cursor = end;
 
         let chunk = self.text.get(start..self.cursor)?;
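
The `sizes` vector above lets the forward scan reuse sizes already measured during the binary search ("Tokenize if we haven't yet" in the commit list) instead of tokenizing every section again. A minimal sketch of that caching pattern, using hypothetical names (`cached_size`, `measure`) rather than the crate's actual code:

```rust
// Hypothetical sketch of the caching pattern; `cached_size` and `measure`
// are illustrative names, not the crate's actual code.
fn cached_size(
    sizes: &mut [Option<usize>],      // one slot per candidate section
    index: usize,
    measure: impl Fn(usize) -> usize, // expensive measurement, e.g. tokenizing
) -> usize {
    if let Some(size) = sizes[index] {
        // Already measured during the binary search: reuse it.
        size
    } else {
        // First time this section is considered: measure once and remember it.
        let size = measure(index);
        sizes[index] = Some(size);
        size
    }
}
```

Tokenizing is the expensive step, so reusing sizes already computed during the binary search keeps the extra forward check cheap.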