From a63289179ea62c5e03997f47c37c889f67fd6c22 Mon Sep 17 00:00:00 2001
From: Ben Brandt
Date: Sat, 14 Dec 2024 07:28:56 +0100
Subject: [PATCH] feat(python): add new parallel methods

---
 CHANGELOG.md                               |   6 +
 Cargo.lock                                 |   5 +-
 Cargo.toml                                 |   4 +-
 bindings/python/Cargo.toml                 |   1 +
 bindings/python/semantic_text_splitter.pyi |  96 +++++++++++++
 bindings/python/src/lib.rs                 | 148 +++++++++++++++++++++
 bindings/python/tests/test_integration.py  |  42 ++++++
 7 files changed, 298 insertions(+), 4 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index f616a620..b573a256 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,11 @@
 # Changelog
 
+## v0.19.1
+
+### What's New
+
+- Python splitters have new `chunk_all` and `chunk_all_indices` methods so that multiple texts can be processed in parallel. (For Rust, you should be able to use `rayon` to do this already.)
+
 ## v0.19.0
 
 ### Breaking Changes
diff --git a/Cargo.lock b/Cargo.lock
index c213a91c..e7e8db5a 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2013,9 +2013,10 @@ dependencies = [
 
 [[package]]
 name = "semantic-text-splitter"
-version = "0.19.0"
+version = "0.19.1"
 dependencies = [
  "pyo3",
+ "rayon",
  "text-splitter",
  "tiktoken-rs",
  "tokenizers",
@@ -2280,7 +2281,7 @@ dependencies = [
 
 [[package]]
 name = "text-splitter"
-version = "0.19.0"
+version = "0.19.1"
 dependencies = [
  "ahash",
  "auto_enums",
diff --git a/Cargo.toml b/Cargo.toml
index d18df6e1..70c41a04 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -2,7 +2,7 @@
 members = ["bindings/*"]
 
 [workspace.package]
-version = "0.19.0"
+version = "0.19.1"
 authors = ["Ben Brandt "]
 edition = "2021"
 description = "Split text into semantic chunks, up to a desired chunk size. Supports calculating length by characters and tokens, and is callable from Rust and Python."
@@ -84,7 +84,7 @@ divan = "0.1.17"
 fake = "3"
 insta = { version = "1.41", features = ["glob", "yaml"] }
 more-asserts = "0.3"
-rayon = "1.10"
+rayon = "1"
 tokenizers = { version = "0.21", default-features = false, features = [
     "onig",
     "http",
diff --git a/bindings/python/Cargo.toml b/bindings/python/Cargo.toml
index d0ea947a..b2e32464 100644
--- a/bindings/python/Cargo.toml
+++ b/bindings/python/Cargo.toml
@@ -16,6 +16,7 @@ crate-type = ["cdylib"]
 
 [dependencies]
 pyo3 = { version = "0.23", features = ["abi3-py39"] }
+rayon = "1"
 text-splitter = { path = "../..", features = [
     "code",
     "markdown",
diff --git a/bindings/python/semantic_text_splitter.pyi b/bindings/python/semantic_text_splitter.pyi
index 78f25fa3..c2ca1fd7 100644
--- a/bindings/python/semantic_text_splitter.pyi
+++ b/bindings/python/semantic_text_splitter.pyi
@@ -269,6 +269,38 @@ class TextSplitter:
             trimmed as well.
         """
 
+    def chunk_all(self, texts: List[str]) -> List[List[str]]:
+        """
+        Generate a list of chunks for a given set of texts. Each chunk will be up to the `capacity`.
+
+        See `chunks` for more information.
+
+        Args:
+            texts (list(str)): Texts to split.
+
+        Returns:
+            A list of lists of strings, one list for each text, and one item for each chunk.
+            If `trim` was specified in the text splitter, then each chunk will already be
+            trimmed as well.
+        """
+
+    def chunk_all_indices(self, texts: List[str]) -> List[List[Tuple[int, str]]]:
+        """
+        Generate a list of chunks for a given set of texts, along with their character offsets in the original text. Each chunk will be up to the `capacity`.
+
+        See `chunks` for more information.
+
+        Args:
+            texts (list(str)): Texts to split.
+
+        Returns:
+            A list of lists of tuples, one list for each text, and one tuple for each chunk.
+            The first tuple item will be the character offset relative
+            to the original text. The second tuple item is the chunk itself.
+            If `trim` was specified in the text splitter, then each chunk will already be
+            trimmed as well.
+        """
+
 @final
 class MarkdownSplitter:
     """Markdown splitter. Recursively splits chunks into the largest semantic units that fit within the chunk size. Also will attempt to merge neighboring chunks if they can fit within the given chunk size.
@@ -543,6 +575,38 @@ class MarkdownSplitter:
             trimmed as well.
         """
 
+    def chunk_all(self, texts: List[str]) -> List[List[str]]:
+        """
+        Generate a list of chunks for a given set of texts. Each chunk will be up to the `capacity`.
+
+        See `chunks` for more information.
+
+        Args:
+            texts (list(str)): Texts to split.
+
+        Returns:
+            A list of lists of strings, one list for each text, and one item for each chunk.
+            If `trim` was specified in the text splitter, then each chunk will already be
+            trimmed as well.
+        """
+
+    def chunk_all_indices(self, texts: List[str]) -> List[List[Tuple[int, str]]]:
+        """
+        Generate a list of chunks for a given set of texts, along with their character offsets in the original text. Each chunk will be up to the `capacity`.
+
+        See `chunks` for more information.
+
+        Args:
+            texts (list(str)): Texts to split.
+
+        Returns:
+            A list of lists of tuples, one list for each text, and one tuple for each chunk.
+            The first tuple item will be the character offset relative
+            to the original text. The second tuple item is the chunk itself.
+            If `trim` was specified in the text splitter, then each chunk will already be
+            trimmed as well.
+        """
+
 @final
 class CodeSplitter:
     """Code splitter. Recursively splits chunks into the largest semantic units that fit within the chunk size. Also will attempt to merge neighboring chunks if they can fit within the given chunk size.
@@ -841,3 +905,35 @@ class CodeSplitter:
             If `trim` was specified in the text splitter, then each chunk will already be
             trimmed as well.
         """
+
+    def chunk_all(self, texts: List[str]) -> List[List[str]]:
+        """
+        Generate a list of chunks for a given set of texts. Each chunk will be up to the `capacity`.
+
+        See `chunks` for more information.
+
+        Args:
+            texts (list(str)): Texts to split.
+
+        Returns:
+            A list of lists of strings, one list for each text, and one item for each chunk.
+            If `trim` was specified in the text splitter, then each chunk will already be
+            trimmed as well.
+        """
+
+    def chunk_all_indices(self, texts: List[str]) -> List[List[Tuple[int, str]]]:
+        """
+        Generate a list of chunks for a given set of texts, along with their character offsets in the original text. Each chunk will be up to the `capacity`.
+
+        See `chunks` for more information.
+
+        Args:
+            texts (list(str)): Texts to split.
+
+        Returns:
+            A list of lists of tuples, one list for each text, and one tuple for each chunk.
+            The first tuple item will be the character offset relative
+            to the original text. The second tuple item is the chunk itself.
+            If `trim` was specified in the text splitter, then each chunk will already be
+            trimmed as well.
+ """ diff --git a/bindings/python/src/lib.rs b/bindings/python/src/lib.rs index 3a0e41dd..30ac551c 100644 --- a/bindings/python/src/lib.rs +++ b/bindings/python/src/lib.rs @@ -11,6 +11,7 @@ use pyo3::{ prelude::*, pybacked::PyBackedStr, }; +use rayon::iter::{IntoParallelIterator, ParallelIterator}; use text_splitter::{ Characters, ChunkCapacity, ChunkCapacityError, ChunkConfig, ChunkConfigError, ChunkSizer, CodeSplitter, CodeSplitterError, MarkdownSplitter, TextSplitter, @@ -512,6 +513,55 @@ impl PyTextSplitter { .map(|c| offsets.map_byte_to_char(c)) .collect() } + + /** + Generate a list of chunks for a given set of texts. Each chunk will be up to the `capacity`. + + See `chunks` for more information. + + Args: + texts (list(str)): Texts to split. + + Returns: + A list of lists of strings, one list for each text, and one item for each chunk. + If `trim` was specified in the text splitter, then each chunk will already be + trimmed as well. + */ + fn chunk_all(&self, texts: Vec) -> Vec> { + texts + .into_par_iter() + .map(|text| self.splitter.chunks(&text).map(ToOwned::to_owned).collect()) + .collect() + } + + /** + Generate a list of chunks for a given set of text, along with their character offsets in the original text. Each chunk will be up to the `capacity`. + + See `chunks` for more information. + + Args: + texts (list(str)): Texts to split. + + Returns: + A list of lists of tuples, one list for each text, and one tuple for each chunk. + The first tuple item will be the character offset relative + to the original text. The second tuple item is the chunk itself. + If `trim` was specified in the text splitter, then each chunk will already be + trimmed as well. + */ + fn chunk_all_indices(&self, texts: Vec) -> Vec> { + texts + .into_par_iter() + .map(|text| { + let mut offsets = ByteToCharOffsetTracker::new(&text); + self.splitter + .chunk_indices(&text) + .map(|c| offsets.map_byte_to_char(c)) + .map(|(i, c)| (i, c.to_owned())) + .collect() + }) + .collect() + } } /** @@ -890,6 +940,55 @@ impl PyMarkdownSplitter { .map(|c| offsets.map_byte_to_char(c)) .collect() } + + /** + Generate a list of chunks for a given set of texts. Each chunk will be up to the `capacity`. + + See `chunks` for more information. + + Args: + texts (list(str)): Texts to split. + + Returns: + A list of lists of strings, one list for each text, and one item for each chunk. + If `trim` was specified in the text splitter, then each chunk will already be + trimmed as well. + */ + fn chunk_all(&self, texts: Vec) -> Vec> { + texts + .into_par_iter() + .map(|text| self.splitter.chunks(&text).map(ToOwned::to_owned).collect()) + .collect() + } + + /** + Generate a list of chunks for a given set of text, along with their character offsets in the original text. Each chunk will be up to the `capacity`. + + See `chunks` for more information. + + Args: + texts (list(str)): Texts to split. + + Returns: + A list of lists of tuples, one list for each text, and one tuple for each chunk. + The first tuple item will be the character offset relative + to the original text. The second tuple item is the chunk itself. + If `trim` was specified in the text splitter, then each chunk will already be + trimmed as well. 
+    */
+    fn chunk_all_indices(&self, texts: Vec<String>) -> Vec<Vec<(usize, String)>> {
+        texts
+            .into_par_iter()
+            .map(|text| {
+                let mut offsets = ByteToCharOffsetTracker::new(&text);
+                self.splitter
+                    .chunk_indices(&text)
+                    .map(|c| offsets.map_byte_to_char(c))
+                    .map(|(i, c)| (i, c.to_owned()))
+                    .collect()
+            })
+            .collect()
+    }
 }
 
 /**
@@ -1325,6 +1424,55 @@ impl PyCodeSplitter {
             .map(|c| offsets.map_byte_to_char(c))
             .collect()
     }
+
+    /**
+    Generate a list of chunks for a given set of texts. Each chunk will be up to the `capacity`.
+
+    See `chunks` for more information.
+
+    Args:
+        texts (list(str)): Texts to split.
+
+    Returns:
+        A list of lists of strings, one list for each text, and one item for each chunk.
+        If `trim` was specified in the text splitter, then each chunk will already be
+        trimmed as well.
+    */
+    fn chunk_all(&self, texts: Vec<String>) -> Vec<Vec<String>> {
+        texts
+            .into_par_iter()
+            .map(|text| self.splitter.chunks(&text).map(ToOwned::to_owned).collect())
+            .collect()
+    }
+
+    /**
+    Generate a list of chunks for a given set of texts, along with their character offsets in the original text. Each chunk will be up to the `capacity`.
+
+    See `chunks` for more information.
+
+    Args:
+        texts (list(str)): Texts to split.
+
+    Returns:
+        A list of lists of tuples, one list for each text, and one tuple for each chunk.
+        The first tuple item will be the character offset relative
+        to the original text. The second tuple item is the chunk itself.
+        If `trim` was specified in the text splitter, then each chunk will already be
+        trimmed as well.
+    */
+    fn chunk_all_indices(&self, texts: Vec<String>) -> Vec<Vec<(usize, String)>> {
+        texts
+            .into_par_iter()
+            .map(|text| {
+                let mut offsets = ByteToCharOffsetTracker::new(&text);
+                self.splitter
+                    .chunk_indices(&text)
+                    .map(|c| offsets.map_byte_to_char(c))
+                    .map(|(i, c)| (i, c.to_owned()))
+                    .collect()
+            })
+            .collect()
+    }
 }
 
 #[doc = include_str!("../README.md")]
diff --git a/bindings/python/tests/test_integration.py b/bindings/python/tests/test_integration.py
index b281c878..2720312c 100644
--- a/bindings/python/tests/test_integration.py
+++ b/bindings/python/tests/test_integration.py
@@ -298,3 +298,45 @@ def test_code_char_indices_with_multibyte_character() -> None:
         (4, "12ü"),
         (8, "12ü"),
     ]
+
+
+def test_chunk_all() -> None:
+    splitter = TextSplitter(4)
+    texts = ["123\n123", "456\n456"]
+    chunks = splitter.chunk_all(texts)
+    assert chunks == [["123", "123"], ["456", "456"]]
+
+
+def test_chunk_all_indices() -> None:
+    splitter = TextSplitter(4)
+    texts = ["123\n123", "456\n456"]
+    chunks = splitter.chunk_all_indices(texts)
+    assert chunks == [[(0, "123"), (4, "123")], [(0, "456"), (4, "456")]]
+
+
+def test_chunk_all_markdown() -> None:
+    splitter = MarkdownSplitter(4)
+    texts = ["123\n123", "456\n456"]
+    chunks = splitter.chunk_all(texts)
+    assert chunks == [["123", "123"], ["456", "456"]]
+
+
+def test_chunk_all_indices_markdown() -> None:
+    splitter = MarkdownSplitter(4)
+    texts = ["123\n123", "456\n456"]
+    chunks = splitter.chunk_all_indices(texts)
+    assert chunks == [[(0, "123"), (4, "123")], [(0, "456"), (4, "456")]]
+
+
+def test_chunk_all_code() -> None:
+    splitter = CodeSplitter(tree_sitter_python.language(), 4)
+    texts = ["123\n123", "456\n456"]
+    chunks = splitter.chunk_all(texts)
+    assert chunks == [["123", "123"], ["456", "456"]]
+
+
+def test_chunk_all_indices_code() -> None:
+    splitter = CodeSplitter(tree_sitter_python.language(), 4)
+    texts = ["123\n123", "456\n456"]
+    chunks = splitter.chunk_all_indices(texts)
+    assert chunks == [[(0, "123"), (4, "123")], [(0, "456"), (4, "456")]]
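Example usage of the new methods from the Python side. This is a minimal sketch mirroring the integration tests above; it assumes a build of this branch is installed as the `semantic-text-splitter` package:

    from semantic_text_splitter import TextSplitter

    # A character-based splitter with a maximum chunk capacity of 4.
    splitter = TextSplitter(4)
    texts = ["123\n123", "456\n456"]

    # chunk_all splits every text in the list, processing them in parallel
    # on the Rust side via rayon, and returns one list of chunks per text.
    assert splitter.chunk_all(texts) == [["123", "123"], ["456", "456"]]

    # chunk_all_indices additionally pairs each chunk with its character
    # offset in the original text.
    assert splitter.chunk_all_indices(texts) == [
        [(0, "123"), (4, "123")],
        [(0, "456"), (4, "456")],
    ]

The same two methods are available on MarkdownSplitter and CodeSplitter, as exercised by the new integration tests.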