From fed9dde3947a333749d89fed36c236e3dd58690a Mon Sep 17 00:00:00 2001 From: Ben Brandt Date: Sun, 2 Jul 2023 22:40:18 +0200 Subject: [PATCH] Python: Update to text-splitter 0.4.2 (#31) --- bindings/python/CHANGELOG.md | 6 ++++ bindings/python/Cargo.lock | 65 +++++++++++++++++------------------- bindings/python/Cargo.toml | 6 ++-- bindings/python/README.md | 28 ++++++++++++++++ 4 files changed, 68 insertions(+), 37 deletions(-) diff --git a/bindings/python/CHANGELOG.md b/bindings/python/CHANGELOG.md index b0f0bb4..c1295fc 100644 --- a/bindings/python/CHANGELOG.md +++ b/bindings/python/CHANGELOG.md @@ -1,5 +1,11 @@ # Changelog +## v0.2.2 + +### What's New + +- Update to v0.4.2 of `text-splitter` to support `tiktoken-rs@0.5.0` + ## v0.2.1 ### What's New diff --git a/bindings/python/Cargo.lock b/bindings/python/Cargo.lock index e6bf18f..9f4383a 100644 --- a/bindings/python/Cargo.lock +++ b/bindings/python/Cargo.lock @@ -28,9 +28,9 @@ checksum = "9c7d0618f0e0b7e8ff11427422b64564d5fb0be1940354bfe2e0529b18a9d9b8" [[package]] name = "auto_enums" -version = "0.8.0" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "10143e1d6fc660ac7bfc268c6ec2f9699129a3cfbb241eed50393d1562e0a4ce" +checksum = "faa44067eaa1097fc513fcdea6b9c42ea8a792f750a181937d52b315477e7b7a" dependencies = [ "derive_utils", "proc-macro2", @@ -262,12 +262,9 @@ dependencies = [ [[package]] name = "hermit-abi" -version = "0.2.6" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee512640fe35acbfb4bb779db6f0d80704c2cacfa2e39b601ef3e3f47d1ae4c7" -dependencies = [ - "libc", -] +checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286" [[package]] name = "ident_case" @@ -301,9 +298,9 @@ dependencies = [ [[package]] name = "itertools" -version = "0.10.5" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" +checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57" dependencies = [ "either", ] @@ -322,9 +319,9 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" [[package]] name = "libc" -version = "0.2.146" +version = "0.2.147" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f92be4933c13fd498862a9e02a3055f8a8d9c039ce33db97306fd5a6caa7f29b" +checksum = "b4668fb0ea861c1df094127ac5f1da3409a82116a4ba74fca2e58ef927159bb3" [[package]] name = "lock_api" @@ -397,7 +394,7 @@ checksum = "8795add3e14028f11f8e848bd3294898a8294767b3776b6f733560d33bd2530b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.18", + "syn 2.0.23", ] [[package]] @@ -412,9 +409,9 @@ dependencies = [ [[package]] name = "num_cpus" -version = "1.15.0" +version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fac9e2da13b5eb447a6ce3d392f23a29d8694bff781bf03a16cd9ac8697593b" +checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" dependencies = [ "hermit-abi", "libc", @@ -491,9 +488,9 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" [[package]] name = "proc-macro2" -version = "1.0.60" +version = "1.0.63" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dec2b086b7a862cf4de201096214fa870344cf922b2b30c167badb3af3195406" +checksum = "7b368fba921b0dce7e60f5e04ec15e565b3303972b42bcfde1d0713b881959eb" dependencies = [ "unicode-ident", ] @@ -560,9 +557,9 @@ dependencies 
= [ [[package]] name = "quote" -version = "1.0.28" +version = "1.0.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b9ab9c7eadfd8df19006f1cf1a4aed13540ed5cbc047010ece5826e10825488" +checksum = "573015e8ab27661678357f27dc26460738fd2b6c86e46f386fde94cb5d913105" dependencies = [ "proc-macro2", ] @@ -688,7 +685,7 @@ checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" [[package]] name = "semantic-text-splitter" -version = "0.2.1" +version = "0.2.2" dependencies = [ "pyo3", "text-splitter", @@ -713,14 +710,14 @@ checksum = "d9735b638ccc51c28bf6914d90a2e9725b377144fc612c49a611fddd1b631d68" dependencies = [ "proc-macro2", "quote", - "syn 2.0.18", + "syn 2.0.23", ] [[package]] name = "serde_json" -version = "1.0.96" +version = "1.0.99" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "057d394a50403bcac12672b2b18fb387ab6d289d957dab67dd201875391e52f1" +checksum = "46266871c240a00b8f503b877622fe33430b3c7d963bdc0f2adc511e54a1eae3" dependencies = [ "itoa", "ryu", @@ -764,9 +761,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.18" +version = "2.0.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32d41677bcbe24c20c52e7c70b0d8db04134c5d1066bf98662e2871ad200ea3e" +checksum = "59fb7d6d8281a51045d62b8eb3a7d1ce347b76f312af50cd3dc0af39c87c1737" dependencies = [ "proc-macro2", "quote", @@ -775,19 +772,19 @@ dependencies = [ [[package]] name = "target-lexicon" -version = "0.12.7" +version = "0.12.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd1ba337640d60c3e96bc6f0638a939b9c9a7f2c316a1598c279828b3d1dc8c5" +checksum = "1b1c7f239eb94671427157bd93b3694320f3668d4e1eff08c7285366fd777fac" [[package]] name = "text-splitter" -version = "0.4.1" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70fb0eba57256b96e1438c8542a99277bddd8c3774f8cfc629bf2f6fe13c09c1" +checksum = "62a5c046e622fc8f2d16754acde6e2a752885cb4b7af7bf47454cf633d9bb57c" dependencies = [ "auto_enums", "either", - "itertools 0.10.5", + "itertools 0.11.0", "once_cell", "regex", "tiktoken-rs", @@ -812,14 +809,14 @@ checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f" dependencies = [ "proc-macro2", "quote", - "syn 2.0.18", + "syn 2.0.23", ] [[package]] name = "tiktoken-rs" -version = "0.4.2" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ba161c549e2c0686f35f5d920e63fad5cafba2c28ad2caceaf07e5d9fa6e8c4" +checksum = "1a99d843674a3468b4a9200a565bbe909a0152f95e82a52feae71e6bf2d4b49d" dependencies = [ "anyhow", "base64 0.21.2", @@ -902,9 +899,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "windows-targets" -version = "0.48.0" +version = "0.48.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b1eb6f0cd7c80c79759c929114ef071b87354ce476d9d94271031c0497adfd5" +checksum = "05d4b17490f70499f20b9e791dcf6a299785ce8af4d709018206dc5b4953e95f" dependencies = [ "windows_aarch64_gnullvm", "windows_aarch64_msvc", diff --git a/bindings/python/Cargo.toml b/bindings/python/Cargo.toml index 0e16495..13f157e 100644 --- a/bindings/python/Cargo.toml +++ b/bindings/python/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "semantic-text-splitter" -version = "0.2.1" +version = "0.2.2" authors = ["Ben Brandt "] edition = "2021" description = "Split text into semantic chunks, up to a desired chunk size. 
Supports calculating length by characters and tokens (when used with large language models)." @@ -15,8 +15,8 @@ crate-type = ["cdylib"] [dependencies] pyo3 = { version = "0.19.0", features = ["abi3-py37"] } -text-splitter = { version = "0.4.1", features = ["tiktoken-rs", "tokenizers"] } -tiktoken-rs = "0.4.2" +text-splitter = { version = "0.4.2", features = ["tiktoken-rs", "tokenizers"] } +tiktoken-rs = "0.5.0" tokenizers = { version = "0.13.3", default_features = false, features = [ "onig", ] } diff --git a/bindings/python/README.md b/bindings/python/README.md index 2cc9f2a..3343535 100644 --- a/bindings/python/README.md +++ b/bindings/python/README.md @@ -21,6 +21,34 @@ splitter = CharacterTextSplitter(trim_chunks=False) chunks = splitter.chunks("your document text", max_characters) ``` +### With Huggingface Tokenizer + +```python +from semantic_text_splitter import HuggingFaceTextSplitter +from tokenizers import Tokenizer + +# Maximum number of tokens in a chunk +max_tokens = 1000 +tokenizer = Tokenizer.from_pretrained("bert-base-uncased") +# Optionally, you can also have the splitter not trim whitespace for you +splitter = HuggingFaceTextSplitter(tokenizer, trim_chunks=False) + +chunks = splitter.chunks("your document text", max_tokens) +``` + +### With Tiktoken Tokenizer + +```python +from semantic_text_splitter import TiktokenTextSplitter + +# Maximum number of tokens in a chunk +max_tokens = 1000 +# Optionally, you can also have the splitter not trim whitespace for you +splitter = TiktokenTextSplitter("gpt-3.5-turbo", trim_chunks=False) + +chunks = splitter.chunks("your document text", max_tokens) +``` + ### Using a Range for Chunk Capacity You also have the option of specifying your chunk capacity as a range.
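The README diff's trailing context cuts off just as the "Using a Range for Chunk Capacity" section begins. For reference, a minimal sketch of what that usage looks like, assuming the `chunks` method accepts a `(min, max)` tuple as its capacity argument (the exact signature is not shown in this patch):

```python
from semantic_text_splitter import CharacterTextSplitter

splitter = CharacterTextSplitter()

# With a (min, max) range, each chunk is filled past the minimum
# size but never beyond the maximum — here, between 200 and 1000
# characters. (Assumed API: capacity passed as a tuple.)
chunks = splitter.chunks("your document text", (200, 1000))
```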