
feat(python): add new parallel methods
benbrandt committed Dec 14, 2024
1 parent 034943a commit a632891
Showing 7 changed files with 298 additions and 4 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,11 @@
# Changelog

## v0.19.1

### What's New

- Python splitters have new `chunk_all` and `chunk_all_indices` methods so that multiple texts can be processed in parallel. (For Rust, you should already be able to do this with `rayon`.)

## v0.19.0

### Breaking Changes
5 changes: 3 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default.

4 changes: 2 additions & 2 deletions Cargo.toml
@@ -2,7 +2,7 @@
members = ["bindings/*"]

[workspace.package]
version = "0.19.0"
version = "0.19.1"
authors = ["Ben Brandt <[email protected]>"]
edition = "2021"
description = "Split text into semantic chunks, up to a desired chunk size. Supports calculating length by characters and tokens, and is callable from Rust and Python."
@@ -84,7 +84,7 @@ divan = "0.1.17"
fake = "3"
insta = { version = "1.41", features = ["glob", "yaml"] }
more-asserts = "0.3"
rayon = "1.10"
rayon = "1"
tokenizers = { version = "0.21", default-features = false, features = [
"onig",
"http",
1 change: 1 addition & 0 deletions bindings/python/Cargo.toml
@@ -16,6 +16,7 @@ crate-type = ["cdylib"]

[dependencies]
pyo3 = { version = "0.23", features = ["abi3-py39"] }
rayon = "1"
text-splitter = { path = "../..", features = [
"code",
"markdown",
96 changes: 96 additions & 0 deletions bindings/python/semantic_text_splitter.pyi
@@ -269,6 +269,38 @@ class TextSplitter:
        trimmed as well.
        """

    def chunk_all(self, texts: List[str]) -> List[List[str]]:
        """
        Generate a list of chunks for a given set of texts. Each chunk will be up to the `capacity`.

        See `chunks` for more information.

        Args:
            texts (list(str)): Texts to split.

        Returns:
            A list of lists of strings, one list for each text, and one item for each chunk.
            If `trim` was specified in the text splitter, then each chunk will already be
            trimmed as well.
        """

    def chunk_all_indices(self, texts: List[str]) -> List[List[Tuple[int, str]]]:
        """
        Generate a list of chunks for a given set of texts, along with their character offsets in the original text. Each chunk will be up to the `capacity`.

        See `chunks` for more information.

        Args:
            texts (list(str)): Texts to split.

        Returns:
            A list of lists of tuples, one list for each text, and one tuple for each chunk.
            The first tuple item will be the character offset relative
            to the original text. The second tuple item is the chunk itself.
            If `trim` was specified in the text splitter, then each chunk will already be
            trimmed as well.
        """

@final
class MarkdownSplitter:
"""Markdown splitter. Recursively splits chunks into the largest semantic units that fit within the chunk size. Also will attempt to merge neighboring chunks if they can fit within the given chunk size.
@@ -543,6 +575,38 @@ class MarkdownSplitter:
        trimmed as well.
        """

    def chunk_all(self, texts: List[str]) -> List[List[str]]:
        """
        Generate a list of chunks for a given set of texts. Each chunk will be up to the `capacity`.

        See `chunks` for more information.

        Args:
            texts (list(str)): Texts to split.

        Returns:
            A list of lists of strings, one list for each text, and one item for each chunk.
            If `trim` was specified in the text splitter, then each chunk will already be
            trimmed as well.
        """

    def chunk_all_indices(self, texts: List[str]) -> List[List[Tuple[int, str]]]:
        """
        Generate a list of chunks for a given set of texts, along with their character offsets in the original text. Each chunk will be up to the `capacity`.

        See `chunks` for more information.

        Args:
            texts (list(str)): Texts to split.

        Returns:
            A list of lists of tuples, one list for each text, and one tuple for each chunk.
            The first tuple item will be the character offset relative
            to the original text. The second tuple item is the chunk itself.
            If `trim` was specified in the text splitter, then each chunk will already be
            trimmed as well.
        """

@final
class CodeSplitter:
"""Code splitter. Recursively splits chunks into the largest semantic units that fit within the chunk size. Also will attempt to merge neighboring chunks if they can fit within the given chunk size.
@@ -841,3 +905,35 @@ class CodeSplitter:
        If `trim` was specified in the text splitter, then each chunk will already be
        trimmed as well.
        """

    def chunk_all(self, texts: List[str]) -> List[List[str]]:
        """
        Generate a list of chunks for a given set of texts. Each chunk will be up to the `capacity`.

        See `chunks` for more information.

        Args:
            texts (list(str)): Texts to split.

        Returns:
            A list of lists of strings, one list for each text, and one item for each chunk.
            If `trim` was specified in the text splitter, then each chunk will already be
            trimmed as well.
        """

    def chunk_all_indices(self, texts: List[str]) -> List[List[Tuple[int, str]]]:
        """
        Generate a list of chunks for a given set of texts, along with their character offsets in the original text. Each chunk will be up to the `capacity`.

        See `chunks` for more information.

        Args:
            texts (list(str)): Texts to split.

        Returns:
            A list of lists of tuples, one list for each text, and one tuple for each chunk.
            The first tuple item will be the character offset relative
            to the original text. The second tuple item is the chunk itself.
            If `trim` was specified in the text splitter, then each chunk will already be
            trimmed as well.
        """
148 changes: 148 additions & 0 deletions bindings/python/src/lib.rs
@@ -11,6 +11,7 @@ use pyo3::{
    prelude::*,
    pybacked::PyBackedStr,
};
use rayon::iter::{IntoParallelIterator, ParallelIterator};
use text_splitter::{
    Characters, ChunkCapacity, ChunkCapacityError, ChunkConfig, ChunkConfigError, ChunkSizer,
    CodeSplitter, CodeSplitterError, MarkdownSplitter, TextSplitter,
@@ -512,6 +513,55 @@ impl PyTextSplitter {
            .map(|c| offsets.map_byte_to_char(c))
            .collect()
    }

    /**
    Generate a list of chunks for a given set of texts. Each chunk will be up to the `capacity`.

    See `chunks` for more information.

    Args:
        texts (list(str)): Texts to split.

    Returns:
        A list of lists of strings, one list for each text, and one item for each chunk.
        If `trim` was specified in the text splitter, then each chunk will already be
        trimmed as well.
    */
    fn chunk_all(&self, texts: Vec<String>) -> Vec<Vec<String>> {
        texts
            .into_par_iter()
            .map(|text| self.splitter.chunks(&text).map(ToOwned::to_owned).collect())
            .collect()
    }

    /**
    Generate a list of chunks for a given set of texts, along with their character offsets in the original text. Each chunk will be up to the `capacity`.

    See `chunks` for more information.

    Args:
        texts (list(str)): Texts to split.

    Returns:
        A list of lists of tuples, one list for each text, and one tuple for each chunk.
        The first tuple item will be the character offset relative
        to the original text. The second tuple item is the chunk itself.
        If `trim` was specified in the text splitter, then each chunk will already be
        trimmed as well.
    */
    fn chunk_all_indices(&self, texts: Vec<String>) -> Vec<Vec<(usize, String)>> {
        texts
            .into_par_iter()
            .map(|text| {
                let mut offsets = ByteToCharOffsetTracker::new(&text);
                self.splitter
                    .chunk_indices(&text)
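                    // `chunk_indices` yields byte offsets; map them to character
                    // offsets so they are usable as Python string indices.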
                    .map(|c| offsets.map_byte_to_char(c))
                    .map(|(i, c)| (i, c.to_owned()))
                    .collect()
            })
            .collect()
    }
}
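As the changelog entry notes, Rust callers can already get the same behavior directly with `rayon`; a minimal sketch under that assumption (character capacity of 4, with `text-splitter` and `rayon` as dependencies), using the same pattern as the binding above:

```rust
use rayon::prelude::*;
use text_splitter::TextSplitter;

/// Split many texts in parallel, producing one list of chunks per text.
fn chunk_all(texts: &[String]) -> Vec<Vec<String>> {
    let splitter = TextSplitter::new(4);
    texts
        .par_iter()
        .map(|text| splitter.chunks(text).map(ToOwned::to_owned).collect())
        .collect()
}
```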

/**
@@ -890,6 +940,55 @@ impl PyMarkdownSplitter {
            .map(|c| offsets.map_byte_to_char(c))
            .collect()
    }

    /**
    Generate a list of chunks for a given set of texts. Each chunk will be up to the `capacity`.

    See `chunks` for more information.

    Args:
        texts (list(str)): Texts to split.

    Returns:
        A list of lists of strings, one list for each text, and one item for each chunk.
        If `trim` was specified in the text splitter, then each chunk will already be
        trimmed as well.
    */
    fn chunk_all(&self, texts: Vec<String>) -> Vec<Vec<String>> {
        texts
            .into_par_iter()
            .map(|text| self.splitter.chunks(&text).map(ToOwned::to_owned).collect())
            .collect()
    }

    /**
    Generate a list of chunks for a given set of texts, along with their character offsets in the original text. Each chunk will be up to the `capacity`.

    See `chunks` for more information.

    Args:
        texts (list(str)): Texts to split.

    Returns:
        A list of lists of tuples, one list for each text, and one tuple for each chunk.
        The first tuple item will be the character offset relative
        to the original text. The second tuple item is the chunk itself.
        If `trim` was specified in the text splitter, then each chunk will already be
        trimmed as well.
    */
    fn chunk_all_indices(&self, texts: Vec<String>) -> Vec<Vec<(usize, String)>> {
        texts
            .into_par_iter()
            .map(|text| {
                let mut offsets = ByteToCharOffsetTracker::new(&text);
                self.splitter
                    .chunk_indices(&text)
                    .map(|c| offsets.map_byte_to_char(c))
                    .map(|(i, c)| (i, c.to_owned()))
                    .collect()
            })
            .collect()
    }
}

/**
@@ -1325,6 +1424,55 @@ impl PyCodeSplitter {
            .map(|c| offsets.map_byte_to_char(c))
            .collect()
    }

    /**
    Generate a list of chunks for a given set of texts. Each chunk will be up to the `capacity`.

    See `chunks` for more information.

    Args:
        texts (list(str)): Texts to split.

    Returns:
        A list of lists of strings, one list for each text, and one item for each chunk.
        If `trim` was specified in the text splitter, then each chunk will already be
        trimmed as well.
    */
    fn chunk_all(&self, texts: Vec<String>) -> Vec<Vec<String>> {
        texts
            .into_par_iter()
            .map(|text| self.splitter.chunks(&text).map(ToOwned::to_owned).collect())
            .collect()
    }

    /**
    Generate a list of chunks for a given set of texts, along with their character offsets in the original text. Each chunk will be up to the `capacity`.

    See `chunks` for more information.

    Args:
        texts (list(str)): Texts to split.

    Returns:
        A list of lists of tuples, one list for each text, and one tuple for each chunk.
        The first tuple item will be the character offset relative
        to the original text. The second tuple item is the chunk itself.
        If `trim` was specified in the text splitter, then each chunk will already be
        trimmed as well.
    */
    fn chunk_all_indices(&self, texts: Vec<String>) -> Vec<Vec<(usize, String)>> {
        texts
            .into_par_iter()
            .map(|text| {
                let mut offsets = ByteToCharOffsetTracker::new(&text);
                self.splitter
                    .chunk_indices(&text)
                    .map(|c| offsets.map_byte_to_char(c))
                    .map(|(i, c)| (i, c.to_owned()))
                    .collect()
            })
            .collect()
    }
}

#[doc = include_str!("../README.md")]
42 changes: 42 additions & 0 deletions bindings/python/tests/test_integration.py
@@ -298,3 +298,45 @@ def test_code_char_indices_with_multibyte_character() -> None:
(4, "12ü"),
(8, "12ü"),
]


def test_chunk_all() -> None:
    splitter = TextSplitter(4)
    texts = ["123\n123", "456\n456"]
    chunks = splitter.chunk_all(texts)
    assert chunks == [["123", "123"], ["456", "456"]]


def test_chunk_all_indices() -> None:
    splitter = TextSplitter(4)
    texts = ["123\n123", "456\n456"]
    chunks = splitter.chunk_all_indices(texts)
    assert chunks == [[(0, "123"), (4, "123")], [(0, "456"), (4, "456")]]


def test_chunk_all_markdown() -> None:
    splitter = MarkdownSplitter(4)
    texts = ["123\n123", "456\n456"]
    chunks = splitter.chunk_all(texts)
    assert chunks == [["123", "123"], ["456", "456"]]


def test_chunk_all_indices_markdown() -> None:
    splitter = MarkdownSplitter(4)
    texts = ["123\n123", "456\n456"]
    chunks = splitter.chunk_all_indices(texts)
    assert chunks == [[(0, "123"), (4, "123")], [(0, "456"), (4, "456")]]


def test_chunk_all_code() -> None:
    splitter = CodeSplitter(tree_sitter_python.language(), 4)
    texts = ["123\n123", "456\n456"]
    chunks = splitter.chunk_all(texts)
    assert chunks == [["123", "123"], ["456", "456"]]


def test_chunk_all_indices_code() -> None:
    splitter = CodeSplitter(tree_sitter_python.language(), 4)
    texts = ["123\n123", "456\n456"]
    chunks = splitter.chunk_all_indices(texts)
    assert chunks == [[(0, "123"), (4, "123")], [(0, "456"), (4, "456")]]
