
feat(python): add new parallel methods
benbrandt committed Dec 14, 2024
1 parent 034943a commit a632891
Showing 7 changed files with 298 additions and 4 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,11 @@
# Changelog

## v0.19.1

### What's New

- Python splitters have new `chunk_all` and `chunk_all_indices` methods so that multiple texts can be processed in parallel. (For Rust, you should already be able to do this with `rayon`.)

## v0.19.0

### Breaking Changes
5 changes: 3 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default.

4 changes: 2 additions & 2 deletions Cargo.toml
@@ -2,7 +2,7 @@
members = ["bindings/*"]

[workspace.package]
version = "0.19.0"
version = "0.19.1"
authors = ["Ben Brandt <[email protected]>"]
edition = "2021"
description = "Split text into semantic chunks, up to a desired chunk size. Supports calculating length by characters and tokens, and is callable from Rust and Python."
@@ -84,7 +84,7 @@ divan = "0.1.17"
fake = "3"
insta = { version = "1.41", features = ["glob", "yaml"] }
more-asserts = "0.3"
rayon = "1.10"
rayon = "1"
tokenizers = { version = "0.21", default-features = false, features = [
"onig",
"http",
1 change: 1 addition & 0 deletions bindings/python/Cargo.toml
@@ -16,6 +16,7 @@ crate-type = ["cdylib"]

[dependencies]
pyo3 = { version = "0.23", features = ["abi3-py39"] }
rayon = "1"
text-splitter = { path = "../..", features = [
"code",
"markdown",
96 changes: 96 additions & 0 deletions bindings/python/semantic_text_splitter.pyi
@@ -269,6 +269,38 @@ class TextSplitter:
        trimmed as well.
        """

    def chunk_all(self, texts: List[str]) -> List[List[str]]:
        """
        Generate a list of chunks for a given set of texts. Each chunk will be up to the `capacity`.

        See `chunks` for more information.

        Args:
            texts (list(str)): Texts to split.

        Returns:
            A list of lists of strings, one list for each text, and one item for each chunk.
            If `trim` was specified in the text splitter, then each chunk will already be
            trimmed as well.
        """

    def chunk_all_indices(self, texts: List[str]) -> List[List[Tuple[int, str]]]:
        """
        Generate a list of chunks for a given set of texts, along with their character offsets in the original text. Each chunk will be up to the `capacity`.

        See `chunks` for more information.

        Args:
            texts (list(str)): Texts to split.

        Returns:
            A list of lists of tuples, one list for each text, and one tuple for each chunk.
            The first tuple item will be the character offset relative
            to the original text. The second tuple item is the chunk itself.
            If `trim` was specified in the text splitter, then each chunk will already be
            trimmed as well.
        """

@final
class MarkdownSplitter:
"""Markdown splitter. Recursively splits chunks into the largest semantic units that fit within the chunk size. Also will attempt to merge neighboring chunks if they can fit within the given chunk size.
@@ -543,6 +575,38 @@ class MarkdownSplitter:
        trimmed as well.
        """

    def chunk_all(self, texts: List[str]) -> List[List[str]]:
        """
        Generate a list of chunks for a given set of texts. Each chunk will be up to the `capacity`.

        See `chunks` for more information.

        Args:
            texts (list(str)): Texts to split.

        Returns:
            A list of lists of strings, one list for each text, and one item for each chunk.
            If `trim` was specified in the text splitter, then each chunk will already be
            trimmed as well.
        """

    def chunk_all_indices(self, texts: List[str]) -> List[List[Tuple[int, str]]]:
        """
        Generate a list of chunks for a given set of texts, along with their character offsets in the original text. Each chunk will be up to the `capacity`.

        See `chunks` for more information.

        Args:
            texts (list(str)): Texts to split.

        Returns:
            A list of lists of tuples, one list for each text, and one tuple for each chunk.
            The first tuple item will be the character offset relative
            to the original text. The second tuple item is the chunk itself.
            If `trim` was specified in the text splitter, then each chunk will already be
            trimmed as well.
        """

@final
class CodeSplitter:
"""Code splitter. Recursively splits chunks into the largest semantic units that fit within the chunk size. Also will attempt to merge neighboring chunks if they can fit within the given chunk size.
@@ -841,3 +905,35 @@ class CodeSplitter:
        If `trim` was specified in the text splitter, then each chunk will already be
        trimmed as well.
        """

    def chunk_all(self, texts: List[str]) -> List[List[str]]:
        """
        Generate a list of chunks for a given set of texts. Each chunk will be up to the `capacity`.

        See `chunks` for more information.

        Args:
            texts (list(str)): Texts to split.

        Returns:
            A list of lists of strings, one list for each text, and one item for each chunk.
            If `trim` was specified in the text splitter, then each chunk will already be
            trimmed as well.
        """

    def chunk_all_indices(self, texts: List[str]) -> List[List[Tuple[int, str]]]:
        """
        Generate a list of chunks for a given set of texts, along with their character offsets in the original text. Each chunk will be up to the `capacity`.

        See `chunks` for more information.

        Args:
            texts (list(str)): Texts to split.

        Returns:
            A list of lists of tuples, one list for each text, and one tuple for each chunk.
            The first tuple item will be the character offset relative
            to the original text. The second tuple item is the chunk itself.
            If `trim` was specified in the text splitter, then each chunk will already be
            trimmed as well.
        """
148 changes: 148 additions & 0 deletions bindings/python/src/lib.rs
@@ -11,6 +11,7 @@ use pyo3::{
    prelude::*,
    pybacked::PyBackedStr,
};
use rayon::iter::{IntoParallelIterator, ParallelIterator};
use text_splitter::{
    Characters, ChunkCapacity, ChunkCapacityError, ChunkConfig, ChunkConfigError, ChunkSizer,
    CodeSplitter, CodeSplitterError, MarkdownSplitter, TextSplitter,
@@ -512,6 +513,55 @@ impl PyTextSplitter {
            .map(|c| offsets.map_byte_to_char(c))
            .collect()
    }

    /**
    Generate a list of chunks for a given set of texts. Each chunk will be up to the `capacity`.

    See `chunks` for more information.

    Args:
        texts (list(str)): Texts to split.

    Returns:
        A list of lists of strings, one list for each text, and one item for each chunk.
        If `trim` was specified in the text splitter, then each chunk will already be
        trimmed as well.
    */
    fn chunk_all(&self, texts: Vec<String>) -> Vec<Vec<String>> {
        texts
            .into_par_iter()
            .map(|text| self.splitter.chunks(&text).map(ToOwned::to_owned).collect())
            .collect()
    }

    /**
    Generate a list of chunks for a given set of texts, along with their character offsets in the original text. Each chunk will be up to the `capacity`.

    See `chunks` for more information.

    Args:
        texts (list(str)): Texts to split.

    Returns:
        A list of lists of tuples, one list for each text, and one tuple for each chunk.
        The first tuple item will be the character offset relative
        to the original text. The second tuple item is the chunk itself.
        If `trim` was specified in the text splitter, then each chunk will already be
        trimmed as well.
    */
    fn chunk_all_indices(&self, texts: Vec<String>) -> Vec<Vec<(usize, String)>> {
        texts
            .into_par_iter()
            .map(|text| {
                let mut offsets = ByteToCharOffsetTracker::new(&text);
                self.splitter
                    .chunk_indices(&text)
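                    // `chunk_indices` yields byte offsets; map them to character
                    // offsets so they are usable as Python string indices.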
                    .map(|c| offsets.map_byte_to_char(c))
                    .map(|(i, c)| (i, c.to_owned()))
                    .collect()
            })
            .collect()
    }
}
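As the changelog entry notes, Rust callers can already get the same behavior directly with `rayon`; a minimal sketch under that assumption (character capacity of 4, with `text-splitter` and `rayon` as dependencies), using the same pattern as the binding above:

```rust
use rayon::prelude::*;
use text_splitter::TextSplitter;

/// Split many texts in parallel, producing one list of chunks per text.
fn chunk_all(texts: &[String]) -> Vec<Vec<String>> {
    let splitter = TextSplitter::new(4);
    texts
        .par_iter()
        .map(|text| splitter.chunks(text).map(ToOwned::to_owned).collect())
        .collect()
}
```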

/**
@@ -890,6 +940,55 @@ impl PyMarkdownSplitter {
            .map(|c| offsets.map_byte_to_char(c))
            .collect()
    }

    /**
    Generate a list of chunks for a given set of texts. Each chunk will be up to the `capacity`.

    See `chunks` for more information.

    Args:
        texts (list(str)): Texts to split.

    Returns:
        A list of lists of strings, one list for each text, and one item for each chunk.
        If `trim` was specified in the text splitter, then each chunk will already be
        trimmed as well.
    */
    fn chunk_all(&self, texts: Vec<String>) -> Vec<Vec<String>> {
        texts
            .into_par_iter()
            .map(|text| self.splitter.chunks(&text).map(ToOwned::to_owned).collect())
            .collect()
    }

    /**
    Generate a list of chunks for a given set of texts, along with their character offsets in the original text. Each chunk will be up to the `capacity`.

    See `chunks` for more information.

    Args:
        texts (list(str)): Texts to split.

    Returns:
        A list of lists of tuples, one list for each text, and one tuple for each chunk.
        The first tuple item will be the character offset relative
        to the original text. The second tuple item is the chunk itself.
        If `trim` was specified in the text splitter, then each chunk will already be
        trimmed as well.
    */
    fn chunk_all_indices(&self, texts: Vec<String>) -> Vec<Vec<(usize, String)>> {
        texts
            .into_par_iter()
            .map(|text| {
                let mut offsets = ByteToCharOffsetTracker::new(&text);
                self.splitter
                    .chunk_indices(&text)
                    .map(|c| offsets.map_byte_to_char(c))
                    .map(|(i, c)| (i, c.to_owned()))
                    .collect()
            })
            .collect()
    }
}

/**
@@ -1325,6 +1424,55 @@ impl PyCodeSplitter {
            .map(|c| offsets.map_byte_to_char(c))
            .collect()
    }

    /**
    Generate a list of chunks for a given set of texts. Each chunk will be up to the `capacity`.

    See `chunks` for more information.

    Args:
        texts (list(str)): Texts to split.

    Returns:
        A list of lists of strings, one list for each text, and one item for each chunk.
        If `trim` was specified in the text splitter, then each chunk will already be
        trimmed as well.
    */
    fn chunk_all(&self, texts: Vec<String>) -> Vec<Vec<String>> {
        texts
            .into_par_iter()
            .map(|text| self.splitter.chunks(&text).map(ToOwned::to_owned).collect())
            .collect()
    }

    /**
    Generate a list of chunks for a given set of texts, along with their character offsets in the original text. Each chunk will be up to the `capacity`.

    See `chunks` for more information.

    Args:
        texts (list(str)): Texts to split.

    Returns:
        A list of lists of tuples, one list for each text, and one tuple for each chunk.
        The first tuple item will be the character offset relative
        to the original text. The second tuple item is the chunk itself.
        If `trim` was specified in the text splitter, then each chunk will already be
        trimmed as well.
    */
    fn chunk_all_indices(&self, texts: Vec<String>) -> Vec<Vec<(usize, String)>> {
        texts
            .into_par_iter()
            .map(|text| {
                let mut offsets = ByteToCharOffsetTracker::new(&text);
                self.splitter
                    .chunk_indices(&text)
                    .map(|c| offsets.map_byte_to_char(c))
                    .map(|(i, c)| (i, c.to_owned()))
                    .collect()
            })
            .collect()
    }
}

#[doc = include_str!("../README.md")]
42 changes: 42 additions & 0 deletions bindings/python/tests/test_integration.py
@@ -298,3 +298,45 @@ def test_code_char_indices_with_multibyte_character() -> None:
(4, "12ü"),
(8, "12ü"),
]


def test_chunk_all() -> None:
    splitter = TextSplitter(4)
    texts = ["123\n123", "456\n456"]
    chunks = splitter.chunk_all(texts)
    assert chunks == [["123", "123"], ["456", "456"]]


def test_chunk_all_indices() -> None:
    splitter = TextSplitter(4)
    texts = ["123\n123", "456\n456"]
    chunks = splitter.chunk_all_indices(texts)
    assert chunks == [[(0, "123"), (4, "123")], [(0, "456"), (4, "456")]]


def test_chunk_all_markdown() -> None:
    splitter = MarkdownSplitter(4)
    texts = ["123\n123", "456\n456"]
    chunks = splitter.chunk_all(texts)
    assert chunks == [["123", "123"], ["456", "456"]]


def test_chunk_all_indices_markdown() -> None:
    splitter = MarkdownSplitter(4)
    texts = ["123\n123", "456\n456"]
    chunks = splitter.chunk_all_indices(texts)
    assert chunks == [[(0, "123"), (4, "123")], [(0, "456"), (4, "456")]]


def test_chunk_all_code() -> None:
    splitter = CodeSplitter(tree_sitter_python.language(), 4)
    texts = ["123\n123", "456\n456"]
    chunks = splitter.chunk_all(texts)
    assert chunks == [["123", "123"], ["456", "456"]]


def test_chunk_all_indices_code() -> None:
    splitter = CodeSplitter(tree_sitter_python.language(), 4)
    texts = ["123\n123", "456\n456"]
    chunks = splitter.chunk_all_indices(texts)
    assert chunks == [[(0, "123"), (4, "123")], [(0, "456"), (4, "456")]]
