diff --git a/CHANGELOG.md b/CHANGELOG.md index c37dcfe..baba6ae 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,209 @@ # Changelog +## v0.12.0 + +### What's New + +This release is a big API change to pull all chunk configuration options into the same place, at initialization of the splitters. This was motivated by two things: + +1. These settings are all important to deciding how to split the text for a given use case, and in practice I saw them often being set together anyway. +2. To prep the library for new features like chunk overlap, where error handling has to be introduced to make sure that invariants are kept between all of the settings. These errors should be handled as soon as possible before chunking the text. + +Overall, I think this has aligned the library with the usage I have seen in the wild, and pulls all of the settings for the "domain" of chunking into a single unit. + +### Breaking Changes + +#### Rust + +- **Trimming is now enabled by default**. This brings the Rust crate in alignment with the Python package. But for every use case I saw, this was already being set to `true`, and this does logically make sense as the default behavior. +- `TextSplitter` and `MarkdownSplitter` now take a `ChunkConfig` in their `::new` method + - This brings the `ChunkSizer`, `ChunkCapacity` and `trim` settings into a single struct that can be instantiated with a builder-lite pattern. + - `with_trim_chunks` method has been removed from `TextSplitter` and `MarkdownSplitter`. You can now set `trim` in the `ChunkConfig` struct. +- `ChunkCapacity` is now a struct instead of a Trait. If you were using a custom `ChunkCapacity`, you can change your `impl` to a `From<TYPE> for ChunkCapacity` instead, and you should be able to still pass it in to all of the same methods.
+ - This also means `ChunkSizer`s take a concrete type in their method instead of an impl + +##### Migration Examples + +**Default settings:** + +```rust +/// Before +let splitter = TextSplitter::default().with_trim_chunks(true); +let chunks = splitter.chunks("your document text", 500); + +/// After +let splitter = TextSplitter::new(500); +let chunks = splitter.chunks("your document text"); +``` + +**Hugging Face Tokenizers:** + +```rust +/// Before +let tokenizer = Tokenizer::from_pretrained("bert-base-cased", None).unwrap(); +let splitter = TextSplitter::new(tokenizer).with_trim_chunks(true); +let chunks = splitter.chunks("your document text", 500); + +/// After +let tokenizer = Tokenizer::from_pretrained("bert-base-cased", None).unwrap(); +let splitter = TextSplitter::new(ChunkConfig::new(500).with_sizer(tokenizer)); +let chunks = splitter.chunks("your document text"); +``` + +**Tiktoken:** + +```rust +/// Before +let tokenizer = cl100k_base().unwrap(); +let splitter = TextSplitter::new(tokenizer).with_trim_chunks(true); +let chunks = splitter.chunks("your document text", 500); + +/// After +let tokenizer = cl100k_base().unwrap(); +let splitter = TextSplitter::new(ChunkConfig::new(500).with_sizer(tokenizer)); +let chunks = splitter.chunks("your document text"); +``` + +**Ranges:** + +```rust +/// Before +let splitter = TextSplitter::default().with_trim_chunks(true); +let chunks = splitter.chunks("your document text", 500..2000); + +/// After +let splitter = TextSplitter::new(500..2000); +let chunks = splitter.chunks("your document text"); +``` + +**Markdown:** + +```rust +/// Before +let splitter = MarkdownSplitter::default().with_trim_chunks(true); +let chunks = splitter.chunks("your document text", 500); + +/// After +let splitter = MarkdownSplitter::new(500); +let chunks = splitter.chunks("your document text"); +``` + +**ChunkSizer impls** + +```rust +pub trait ChunkSizer { + /// Before + fn chunk_size(&self, chunk: &str, capacity: &impl ChunkCapacity) -> 
ChunkSize; + /// After + fn chunk_size(&self, chunk: &str, capacity: &ChunkCapacity) -> ChunkSize; +} +``` + +**ChunkCapacity impls** + +```rust +/// Before +impl ChunkCapacity for Range<usize> { + fn start(&self) -> Option<usize> { + Some(self.start) + } + + fn end(&self) -> usize { + self.end.saturating_sub(1).max(self.start) + } +} + +/// After +impl From<Range<usize>> for ChunkCapacity { + fn from(range: Range<usize>) -> Self { + ChunkCapacity::new(range.start) + .with_max(range.end.saturating_sub(1).max(range.start)) + .expect("invalid range") + } +} +``` + +#### Python + +- Chunk `capacity` is now a required argument in the `__init__` and classmethods of `TextSplitter` and `MarkdownSplitter` +- `trim_chunks` parameter is now just `trim` in the `__init__` and classmethods of `TextSplitter` and `MarkdownSplitter` + +##### Migration Examples + +**Default settings:** + +```python +# Before +splitter = TextSplitter() +chunks = splitter.chunks("your document text", 500) + +# After +splitter = TextSplitter(500) +chunks = splitter.chunks("your document text") +``` + +**Ranges:** + +```python +# Before +splitter = TextSplitter() +chunks = splitter.chunks("your document text", (200,1000)) + +# After +splitter = TextSplitter((200,1000)) +chunks = splitter.chunks("your document text") +``` + +**Hugging Face Tokenizers:** + +```python +# Before +tokenizer = Tokenizer.from_pretrained("bert-base-uncased") +splitter = TextSplitter.from_huggingface_tokenizer(tokenizer) +chunks = splitter.chunks("your document text", 500) + +# After +tokenizer = Tokenizer.from_pretrained("bert-base-uncased") +splitter = TextSplitter.from_huggingface_tokenizer(tokenizer, 500) +chunks = splitter.chunks("your document text") +``` + +**Tiktoken:** + +```python +# Before +splitter = TextSplitter.from_tiktoken_model("gpt-3.5-turbo") +chunks = splitter.chunks("your document text", 500) + +# After +splitter = TextSplitter.from_tiktoken_model("gpt-3.5-turbo", 500) +chunks = splitter.chunks("your document text") +``` + +**Custom 
callback:** + +```python +# Before +splitter = TextSplitter.from_callback(lambda text: len(text)) +chunks = splitter.chunks("your document text", 500) + +# After +splitter = TextSplitter.from_callback(lambda text: len(text), 500) +chunks = splitter.chunks("your document text") +``` + +**Markdown:** + +```python +# Before +splitter = MarkdownSplitter() +chunks = splitter.chunks("your document text", 500) + +# After +splitter = MarkdownSplitter(500) +chunks = splitter.chunks("your document text") +``` + ## v0.11.0 ### Breaking Changes diff --git a/Cargo.lock b/Cargo.lock index 12b188c..0ee1aea 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1116,9 +1116,9 @@ dependencies = [ [[package]] name = "rustls-pki-types" -version = "1.4.1" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ecd36cc4259e3e4514335c4a138c6b43171a8d61d8f5c9348f9fc7529416f247" +checksum = "beb461507cee2c2ff151784c52762cf4d9ff6a61f3e80968600ed24fa837fa54" [[package]] name = "rustls-webpki" @@ -1547,37 +1547,15 @@ dependencies = [ "rustls-pki-types", ] -[[package]] -name = "winapi" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" -dependencies = [ - "winapi-i686-pc-windows-gnu", - "winapi-x86_64-pc-windows-gnu", -] - -[[package]] -name = "winapi-i686-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" - [[package]] name = "winapi-util" -version = "0.1.6" +version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f29e6f9198ba0d26b4c9f07dbe6f9ed633e1f3d5b8b414090084349e46a52596" +checksum = "134306a13c5647ad6453e8deaec55d3a44d6021970129e6188735e74bf546697" dependencies = [ - "winapi", + "windows-sys 0.52.0", ] -[[package]] -name = "winapi-x86_64-pc-windows-gnu" -version = "0.4.0" 
-source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" - [[package]] name = "windows-sys" version = "0.48.0"