diff --git a/CHANGELOG.md b/CHANGELOG.md
index baba6ae6..252c7d36 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,11 @@
 # Changelog
 
+## v0.12.1
+
+### What's New
+
+- [`rust_tokenizers`](https://crates.io/crates/rust_tokenizers) support has been added to the Rust crate.
+
 ## v0.12.0
 
 ### What's New
diff --git a/Cargo.lock b/Cargo.lock
index 4eb5b531..5e562822 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1864,7 +1864,7 @@ dependencies = [
 
 [[package]]
 name = "semantic-text-splitter"
-version = "0.12.0"
+version = "0.12.1"
 dependencies = [
  "auto_enums",
  "pyo3",
@@ -2100,7 +2100,7 @@ dependencies = [
 
 [[package]]
 name = "text-splitter"
-version = "0.12.0"
+version = "0.12.1"
 dependencies = [
  "ahash",
  "auto_enums",
diff --git a/Cargo.toml b/Cargo.toml
index 3e659be5..e20d87e1 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -2,7 +2,7 @@
 members = ["bindings/*"]
 
 [workspace.package]
-version = "0.12.0"
+version = "0.12.1"
 authors = ["Ben Brandt "]
 edition = "2021"
 description = "Split text into semantic chunks, up to a desired chunk size. Supports calculating length by characters and tokens, and is callable from Rust and Python."
@@ -81,8 +81,8 @@ harness = false
 [features]
 markdown = ["dep:pulldown-cmark"]
 rust-tokenizers = ["dep:rust_tokenizers"]
-tokenizers = ["dep:tokenizers"]
 tiktoken-rs = ["dep:tiktoken-rs"]
+tokenizers = ["dep:tokenizers"]
 
 [lints.rust]
 future_incompatible = "warn"
diff --git a/README.md b/README.md
index c9c7f183..8d25baab 100644
--- a/README.md
+++ b/README.md
@@ -170,6 +170,7 @@ There are lots of methods of determining sentence breaks, all to varying degrees
 
 | Dependency Feature | Version Supported | Description                                                                                                                                                                        |
 | ------------------ | ----------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `rust_tokenizers`  | `8.1.1`           | Enables `(Text/Markdown)Splitter::new` to take any of the provided tokenizers as an argument.                                                                                      |
 | `tiktoken-rs`      | `0.5.8`           | Enables `(Text/Markdown)Splitter::new` to take `tiktoken_rs::CoreBPE` as an argument. This is useful for splitting text for OpenAI models.                                         |
 | `tokenizers`       | `0.19.1`          | Enables `(Text/Markdown)Splitter::new` to take `tokenizers::Tokenizer` as an argument. This is useful for splitting text for models that have a Hugging Face-compatible tokenizer. |
 
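For reference, a minimal sketch of what the new `rust-tokenizers` feature enables on the Rust side. It assumes the v0.12-era API, where the splitter takes the tokenizer in `new` and the chunk capacity in `chunks`; the choice of `BertTokenizer` and the vocab path are illustrative only, since the changelog says any of the tokenizers provided by `rust_tokenizers` should be accepted:

```rust
// Cargo.toml: text-splitter = { version = "0.12.1", features = ["rust-tokenizers"] }
use rust_tokenizers::tokenizer::BertTokenizer;
use text_splitter::TextSplitter;

fn main() {
    // Hypothetical vocab path; any tokenizer provided by rust_tokenizers
    // should work as the sizer here.
    let tokenizer = BertTokenizer::from_file("path/to/vocab.txt", false, false)
        .expect("failed to load vocab");

    // Chunk capacity is now measured in tokens, as counted by the
    // rust_tokenizers tokenizer, instead of characters.
    let splitter = TextSplitter::new(tokenizer);
    let max_tokens = 512;

    for chunk in splitter.chunks("your document text", max_tokens) {
        println!("{chunk}");
    }
}
```

This mirrors the existing `tiktoken-rs` and `tokenizers` integrations listed in the README table: the tokenizer is passed where a character-based sizer would otherwise be used.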