From 2c1c310ee4596997348bb1715171b898c1b17e5e Mon Sep 17 00:00:00 2001 From: Ben Brandt Date: Wed, 27 Dec 2023 20:30:52 +0100 Subject: [PATCH 1/4] Python: v0.3.0 release --- bindings/python/CHANGELOG.md | 10 +++++ bindings/python/Cargo.lock | 82 ++++++++++++++++++------------------ bindings/python/Cargo.toml | 4 +- 3 files changed, 52 insertions(+), 44 deletions(-) diff --git a/bindings/python/CHANGELOG.md b/bindings/python/CHANGELOG.md index 6caa4a7b..50f49d51 100644 --- a/bindings/python/CHANGELOG.md +++ b/bindings/python/CHANGELOG.md @@ -1,5 +1,15 @@ # Changelog +## v0.3.0 + +### What's New + +- Update to `v0.5.0` of `text-splitter` for significant performance improvements for generating chunks with the `tokenizers` or `tiktoken-rs` crates by applying binary search when attempting to find the next matching chunk size. + +### Breaking Changes + +- Due to using binary search, there are some slight differences at the edges of chunks where the algorithm was a little greedier before. If two candidates would tokenize to the same number of tokens that fit within the capacity, it will now choose the shorter text. Due to the nature of tokenizers, this happens more often with whitespace at the end of a chunk, and rarely affects users who have set `trim_chunks=true`. It is a tradeoff, but keeping the exact same behavior would have made the binary search code much more complicated. 
+ ## v0.2.4 ### What's New diff --git a/bindings/python/Cargo.lock b/bindings/python/Cargo.lock index 03644e06..fb0ad2cc 100644 --- a/bindings/python/Cargo.lock +++ b/bindings/python/Cargo.lock @@ -13,9 +13,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.76" +version = "1.0.77" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59d2a3357dde987206219e78ecfbbb6e8dad06cbb65292758d3270e6254f7355" +checksum = "c9d19de80eff169429ac1e9f48fffb163916b448a44e8e046186232046d9e1f9" [[package]] name = "auto_enums" @@ -26,7 +26,7 @@ dependencies = [ "derive_utils", "proc-macro2", "quote", - "syn 2.0.39", + "syn 2.0.43", ] [[package]] @@ -96,9 +96,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "crossbeam-deque" -version = "0.8.3" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef" +checksum = "fca89a0e215bab21874660c67903c5f143333cab1da83d041c7ded6053774751" dependencies = [ "cfg-if", "crossbeam-epoch", @@ -107,22 +107,20 @@ dependencies = [ [[package]] name = "crossbeam-epoch" -version = "0.9.15" +version = "0.9.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae211234986c545741a7dc064309f67ee1e5ad243d0e48335adc0484d960bcc7" +checksum = "0e3681d554572a651dda4186cd47240627c3d0114d45a95f6ad27f2f22e7548d" dependencies = [ "autocfg", "cfg-if", "crossbeam-utils", - "memoffset", - "scopeguard", ] [[package]] name = "crossbeam-utils" -version = "0.8.16" +version = "0.8.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a22b2d63d4d1dc0b7f1b6b2747dd0088008a9be28b6ddf0b1e7d335e3037294" +checksum = "c3a430a770ebd84726f584a90ee7f020d28db52c6d02138900f22341f866d39c" dependencies = [ "cfg-if", ] @@ -201,7 +199,7 @@ checksum = "9abcad25e9720609ccb3dcdb795d845e37d8ce34183330a9f48b03a1a71c8e21" dependencies = [ 
"proc-macro2", "quote", - "syn 2.0.39", + "syn 2.0.43", ] [[package]] @@ -281,9 +279,9 @@ dependencies = [ [[package]] name = "itoa" -version = "1.0.9" +version = "1.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af150ab688ff2122fcef229be89cb50dd66af9e01a4ff320cc137eecc9bacc38" +checksum = "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c" [[package]] name = "lazy_static" @@ -293,9 +291,9 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" [[package]] name = "libc" -version = "0.2.150" +version = "0.2.151" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89d92a4743f9a61002fae18374ed11e7973f530cb3a3255fb354818118b2203c" +checksum = "302d7ab3130588088d277783b1e2d2e10c9e9e4a16dd9050e6ec93fb3e7048f4" [[package]] name = "lock_api" @@ -368,7 +366,7 @@ checksum = "531c82a934da419bed3da09bd87d6e98c72f8d4aa755427b3b009c2b8b8c433c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.39", + "syn 2.0.43", ] [[package]] @@ -383,9 +381,9 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.18.0" +version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" +checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" [[package]] name = "onig" @@ -440,9 +438,9 @@ checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c" [[package]] name = "pkg-config" -version = "0.3.27" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964" +checksum = "69d3587f8a9e599cc7ec2c00e331f71c4e69a5f9a4b8a6efd5b07466b9736f9a" [[package]] name = "ppv-lite86" @@ -452,9 +450,9 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" [[package]] name = "proc-macro2" -version = "1.0.69" +version = "1.0.71" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "134c189feb4956b20f6f547d2cf727d4c0fe06722b20a0eec87ed445a97f92da" +checksum = "75cb1540fadbd5b8fbccc4dddad2734eba435053f725621c070711a14bb5f4b8" dependencies = [ "unicode-ident", ] @@ -505,7 +503,7 @@ dependencies = [ "proc-macro2", "pyo3-macros-backend", "quote", - "syn 2.0.39", + "syn 2.0.43", ] [[package]] @@ -517,7 +515,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.39", + "syn 2.0.43", ] [[package]] @@ -642,9 +640,9 @@ checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" [[package]] name = "ryu" -version = "1.0.15" +version = "1.0.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ad4cc8da4ef723ed60bced201181d83791ad433213d8c24efffda1eec85d741" +checksum = "f98d2aa92eebf49b69786be48e4477826b256916e84a57ff2a4f21923b48eb4c" [[package]] name = "scopeguard" @@ -654,7 +652,7 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] name = "semantic-text-splitter" -version = "0.2.4" +version = "0.3.0" dependencies = [ "pyo3", "text-splitter", @@ -664,22 +662,22 @@ dependencies = [ [[package]] name = "serde" -version = "1.0.192" +version = "1.0.193" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bca2a08484b285dcb282d0f67b26cadc0df8b19f8c12502c13d966bf9482f001" +checksum = "25dd9975e68d0cb5aa1120c288333fc98731bd1dd12f561e468ea4728c042b89" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.192" +version = "1.0.193" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d6c7207fbec9faa48073f3e3074cbe553af6ea512d7c21ba46e434e70ea9fbc1" +checksum = "43576ca501357b9b071ac53cdc7da8ef0cbd9493d8df094cd821777ea6e894d3" dependencies = [ "proc-macro2", "quote", - "syn 2.0.39", + "syn 2.0.43", ] [[package]] @@ -730,9 +728,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.39" +version = "2.0.43" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "23e78b90f2fcf45d3e842032ce32e3f2d1545ba6636271dcbf24fa306d87be7a" +checksum = "ee659fb5f3d355364e1f3e5bc10fb82068efbf824a1e9d1c9504244a6469ad53" dependencies = [ "proc-macro2", "quote", @@ -747,9 +745,9 @@ checksum = "14c39fd04924ca3a864207c66fc2cd7d22d7c016007f9ce846cbb9326331930a" [[package]] name = "text-splitter" -version = "0.4.5" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da5a47ceb619c4d711989aa5ab579c88cd2de53283413a5c9a3dae4c8411d522" +checksum = "7d2472fe4430d7a5dd846f6a3d480593b955c00ddf15440f1680645061b3c0a3" dependencies = [ "auto_enums", "either", @@ -763,22 +761,22 @@ dependencies = [ [[package]] name = "thiserror" -version = "1.0.50" +version = "1.0.52" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9a7210f5c9a7156bb50aa36aed4c95afb51df0df00713949448cf9e97d382d2" +checksum = "83a48fd946b02c0a526b2e9481c8e2a17755e47039164a86c4070446e3a4614d" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.50" +version = "1.0.52" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "266b2e40bc00e5a6c09c3584011e08b06f123c00362c92b975ba9843aaaa14b8" +checksum = "e7fbe9b594d6568a6a1443250a7e67d80b74e1e96f6d1715e1e21cc1888291d3" dependencies = [ "proc-macro2", "quote", - "syn 2.0.39", + "syn 2.0.43", ] [[package]] diff --git a/bindings/python/Cargo.toml b/bindings/python/Cargo.toml index 65b8c9cd..5ec436c7 100644 --- a/bindings/python/Cargo.toml +++ b/bindings/python/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "semantic-text-splitter" -version = "0.2.4" +version = "0.3.0" authors = ["Ben Brandt "] edition = "2021" description = "Split text into semantic chunks, up to a desired chunk size. Supports calculating length by characters and tokens (when used with large language models)." 
@@ -15,7 +15,7 @@ crate-type = ["cdylib"] [dependencies] pyo3 = { version = "0.20.0", features = ["abi3-py37"] } -text-splitter = { version = "0.4.5", features = ["tiktoken-rs", "tokenizers"] } +text-splitter = { version = "0.5.0", features = ["tiktoken-rs", "tokenizers"] } tiktoken-rs = "0.5.8" tokenizers = { version = "0.15.0", default_features = false, features = [ "onig", From d9f460c0389a1ee8aea26bfc30d3515c705c596e Mon Sep 17 00:00:00 2001 From: Ben Brandt Date: Wed, 27 Dec 2023 20:37:32 +0100 Subject: [PATCH 2/4] Update upload-artifact usage --- .github/workflows/python.yml | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 1fd0e88c..284d720e 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -71,9 +71,9 @@ jobs: manylinux: auto working-directory: bindings/python - name: Upload wheels - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: - name: wheels + name: wheels-linux-${{ matrix.target }} path: bindings/python/dist if-no-files-found: error - name: pytest @@ -108,9 +108,9 @@ jobs: sccache: "true" working-directory: bindings/python - name: Upload wheels - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: - name: wheels + name: wheels-windows-${{ matrix.target }} path: bindings/python/dist if-no-files-found: error - name: pytest @@ -144,9 +144,9 @@ jobs: sccache: "true" working-directory: bindings/python - name: Upload wheels - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: - name: wheels + name: wheels-macos-${{ matrix.target }} path: bindings/python/dist if-no-files-found: error - name: pytest @@ -173,9 +173,9 @@ jobs: args: --out dist working-directory: bindings/python - name: Upload sdist - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: - name: wheels + name: wheels-sdist path: bindings/python/dist if-no-files-found: error 
@@ -187,7 +187,9 @@ jobs: steps: - uses: actions/download-artifact@v4 with: - name: wheels + path: wheels + pattern: wheels-* + merge-multiple: true - name: Publish to PyPI uses: PyO3/maturin-action@v1 env: From 73152f685b22f311adf0a5c1a721abbd5987a469 Mon Sep 17 00:00:00 2001 From: Ben Brandt Date: Wed, 27 Dec 2023 20:42:35 +0100 Subject: [PATCH 3/4] Fix tests --- bindings/python/tests/test_integration.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bindings/python/tests/test_integration.py b/bindings/python/tests/test_integration.py index 62731c3c..b58967f2 100644 --- a/bindings/python/tests/test_integration.py +++ b/bindings/python/tests/test_integration.py @@ -32,14 +32,14 @@ def test_hugging_face(): tokenizer = Tokenizer.from_pretrained("bert-base-uncased") splitter = HuggingFaceTextSplitter(tokenizer, trim_chunks=False) text = "123\n123" - assert splitter.chunks(text, 1) == ["123\n", "123"] + assert splitter.chunks(text, 1) == ["123", "\n123"] def test_hugging_face_range(): tokenizer = Tokenizer.from_pretrained("bert-base-uncased") splitter = HuggingFaceTextSplitter(tokenizer, trim_chunks=False) text = "123\n123" - assert splitter.chunks(text=text, chunk_capacity=(1, 2)) == ["123\n", "123"] + assert splitter.chunks(text=text, chunk_capacity=(1, 2)) == ["123", "\n123"] def test_hugging_face_trim(): From ab7121b53e173f148ce5b5bfab2dc0df01a0e7bd Mon Sep 17 00:00:00 2001 From: Ben Brandt Date: Wed, 27 Dec 2023 20:47:55 +0100 Subject: [PATCH 4/4] Bump minimum python version to 3.8 --- bindings/python/CHANGELOG.md | 1 + bindings/python/Cargo.toml | 2 +- bindings/python/pyproject.toml | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/bindings/python/CHANGELOG.md b/bindings/python/CHANGELOG.md index 50f49d51..a519cb58 100644 --- a/bindings/python/CHANGELOG.md +++ b/bindings/python/CHANGELOG.md @@ -8,6 +8,7 @@ ### Breaking Changes +- Minimum Python version is now 3.8. 
- Due to using binary search, there are some slight differences at the edges of chunks where the algorithm was a little greedier before. If two candidates would tokenize to the same number of tokens that fit within the capacity, it will now choose the shorter text. Due to the nature of tokenizers, this happens more often with whitespace at the end of a chunk, and rarely affects users who have set `trim_chunks=true`. It is a tradeoff, but keeping the exact same behavior would have made the binary search code much more complicated. ## v0.2.4 diff --git a/bindings/python/Cargo.toml b/bindings/python/Cargo.toml index 5ec436c7..20a9061d 100644 --- a/bindings/python/Cargo.toml +++ b/bindings/python/Cargo.toml @@ -14,7 +14,7 @@ name = "semantic_text_splitter" crate-type = ["cdylib"] [dependencies] -pyo3 = { version = "0.20.0", features = ["abi3-py37"] } +pyo3 = { version = "0.20.0", features = ["abi3-py38"] } text-splitter = { version = "0.5.0", features = ["tiktoken-rs", "tokenizers"] } tiktoken-rs = "0.5.8" tokenizers = { version = "0.15.0", default_features = false, features = [ diff --git a/bindings/python/pyproject.toml b/bindings/python/pyproject.toml index 03fffc71..d165f476 100644 --- a/bindings/python/pyproject.toml +++ b/bindings/python/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "maturin" [project] name = "semantic-text-splitter" -requires-python = ">=3.7" +requires-python = ">=3.8" classifiers = [ "Programming Language :: Rust", "Programming Language :: Python :: Implementation :: CPython",