Commit 981043d: Use chunk config in Markdown as well

benbrandt committed Apr 20, 2024 (1 parent: 99cb125)
Showing 11 changed files with 385 additions and 663 deletions.

README.md (6 changes: 2 additions & 4 deletions)
````diff
@@ -113,11 +113,9 @@ use text_splitter::MarkdownSplitter;
 let max_characters = 1000;
 // Default implementation uses character count for chunk size.
 // Can also use all of the same tokenizer implementations as `TextSplitter`.
-let splitter = MarkdownSplitter::default()
-    // Optionally can also have the splitter trim whitespace for you
-    .with_trim_chunks(true);
+let splitter = MarkdownSplitter::new(max_characters);
 
-let chunks = splitter.chunks("# Header\n\nyour document text", max_characters);
+let chunks = splitter.chunks("# Header\n\nyour document text");
 ```
 
 ## Method
````
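
With this change the chunk capacity moves into the `MarkdownSplitter` constructor, and a tokenizer-based sizer rides along in the same `ChunkConfig`, mirroring `TextSplitter`. A minimal sketch of the configured form, assuming the `tiktoken-rs` feature is enabled (the `ChunkConfig::new(...).with_sizer(...)` pattern is taken from the benchmark changes below):

```rust
use text_splitter::{ChunkConfig, MarkdownSplitter};

// Capacity and sizer now travel together inside the splitter itself,
// so `chunks()` no longer takes a size argument.
let config = ChunkConfig::new(1000).with_sizer(tiktoken_rs::cl100k_base().unwrap());
let splitter = MarkdownSplitter::new(config);
let chunks = splitter.chunks("# Header\n\nyour document text");
```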

benches/chunk_size.rs (28 changes: 15 additions & 13 deletions)
````diff
@@ -43,23 +43,23 @@ mod text {
 
     #[divan::bench(args = TEXT_FILENAMES, consts = CHUNK_SIZES)]
     fn characters<const N: usize>(bencher: Bencher<'_, '_>, filename: &str) {
-        bench::<_, _>(bencher, filename, || TextSplitter::new(N));
+        bench(bencher, filename, || TextSplitter::new(N));
     }
 
     #[cfg(feature = "tiktoken-rs")]
     #[divan::bench(args = TEXT_FILENAMES, consts = CHUNK_SIZES)]
     fn tiktoken<const N: usize>(bencher: Bencher<'_, '_>, filename: &str) {
         use text_splitter::ChunkConfig;
 
-        bench::<_, _>(bencher, filename, || {
+        bench(bencher, filename, || {
             TextSplitter::new(ChunkConfig::new(N).with_sizer(tiktoken_rs::cl100k_base().unwrap()))
         });
     }
 
     #[cfg(feature = "tokenizers")]
     #[divan::bench(args = TEXT_FILENAMES, consts = CHUNK_SIZES)]
     fn tokenizers<const N: usize>(bencher: Bencher<'_, '_>, filename: &str) {
-        bench::<_, _>(bencher, filename, || {
+        bench(bencher, filename, || {
             TextSplitter::new(ChunkConfig::new(N).with_sizer(
                 tokenizers::Tokenizer::from_pretrained("bert-base-cased", None).unwrap(),
             ))
@@ -73,15 +73,15 @@ mod markdown {
     use std::fs;
 
     use divan::{black_box_drop, counter::BytesCount, Bencher};
-    use text_splitter::{ChunkSizer, MarkdownSplitter};
+    use text_splitter::{ChunkConfig, ChunkSizer, MarkdownSplitter};
 
     use crate::CHUNK_SIZES;
 
     const MARKDOWN_FILENAMES: &[&str] = &["commonmark_spec"];
 
-    fn bench<const N: usize, S, G>(bencher: Bencher<'_, '_>, filename: &str, gen_splitter: G)
+    fn bench<S, G>(bencher: Bencher<'_, '_>, filename: &str, gen_splitter: G)
     where
-        G: Fn() -> MarkdownSplitter<S> + Sync,
+        G: Fn() -> MarkdownSplitter<usize, S> + Sync,
         S: ChunkSizer,
     {
         bencher
@@ -93,30 +93,32 @@ mod markdown {
             })
             .input_counter(|(_, text)| BytesCount::of_str(text))
             .bench_values(|(splitter, text)| {
-                splitter.chunks(&text, N).for_each(black_box_drop);
+                splitter.chunks(&text).for_each(black_box_drop);
             });
     }
 
     #[divan::bench(args = MARKDOWN_FILENAMES, consts = CHUNK_SIZES)]
     fn characters<const N: usize>(bencher: Bencher<'_, '_>, filename: &str) {
-        bench::<N, _, _>(bencher, filename, MarkdownSplitter::default);
+        bench(bencher, filename, || MarkdownSplitter::new(N));
     }
 
     #[cfg(feature = "tiktoken-rs")]
     #[divan::bench(args = MARKDOWN_FILENAMES, consts = CHUNK_SIZES)]
     fn tiktoken<const N: usize>(bencher: Bencher<'_, '_>, filename: &str) {
-        bench::<N, _, _>(bencher, filename, || {
-            MarkdownSplitter::new(tiktoken_rs::cl100k_base().unwrap())
+        bench(bencher, filename, || {
+            MarkdownSplitter::new(
+                ChunkConfig::new(N).with_sizer(tiktoken_rs::cl100k_base().unwrap()),
+            )
         });
     }
 
     #[cfg(feature = "tokenizers")]
     #[divan::bench(args = MARKDOWN_FILENAMES, consts = CHUNK_SIZES)]
     fn tokenizers<const N: usize>(bencher: Bencher<'_, '_>, filename: &str) {
-        bench::<N, _, _>(bencher, filename, || {
-            MarkdownSplitter::new(
+        bench(bencher, filename, || {
+            MarkdownSplitter::new(ChunkConfig::new(N).with_sizer(
                 tokenizers::Tokenizer::from_pretrained("bert-base-cased", None).unwrap(),
-            )
+            ))
         });
     }
 }
````
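
The Python stub changes below document the tuple form of `capacity`, where a chunk counts as "full" anywhere inside an inclusive range. On the Rust side the same idea is expressed with a range capacity; a sketch under the assumption that ranges satisfy the chunk-capacity trait for `MarkdownSplitter` the same way they do for `TextSplitter`:

```rust
use text_splitter::MarkdownSplitter;

// Assumed range capacity: filling stops once a chunk reaches 200
// characters, and a chunk never exceeds 1000.
let splitter = MarkdownSplitter::new(200..1000);
let chunks = splitter.chunks("# Header\n\nyour document text");
```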

bindings/python/semantic_text_splitter.pyi (117 changes: 67 additions & 50 deletions)
````diff
@@ -127,7 +127,7 @@ class TextSplitter:
                 up the chunk until the lower range is met.
             trim (bool, optional): Specify whether chunks should have whitespace trimmed from the
                 beginning and end or not. If False, joining all chunks will return the original
-                string. Defaults to True..
+                string. Defaults to True.
 
         Returns:
             The new text splitter
@@ -149,8 +149,7 @@ class TextSplitter:
                 up the chunk until the lower range is met.
             trim (bool, optional): Specify whether chunks should have whitespace trimmed from the
                 beginning and end or not. If False, joining all chunks will return the original
-                string. Defaults to True..
-
+                string. Defaults to True.
 
         Returns:
             The new text splitter
@@ -171,7 +170,7 @@ class TextSplitter:
                 up the chunk until the lower range is met.
             trim (bool, optional): Specify whether chunks should have whitespace trimmed from the
                 beginning and end or not. If False, joining all chunks will return the original
-                string. Defaults to True..
+                string. Defaults to True.
 
         Returns:
             The new text splitter
````
````diff
@@ -258,10 +257,10 @@ class MarkdownSplitter:
     # Maximum number of characters in a chunk
     max_characters = 1000
     # Optionally can also have the splitter not trim whitespace for you
-    splitter = MarkdownSplitter()
-    # splitter = MarkdownSplitter(trim_chunks=False)
+    splitter = MarkdownSplitter(max_characters)
+    # splitter = MarkdownSplitter(max_characters, trim=False)
 
-    chunks = splitter.chunks("# Header\n\nyour document text", max_characters)
+    chunks = splitter.chunks("# Header\n\nyour document text")
     ```
 
     ### Using a Range for Chunk Capacity
@@ -275,11 +274,11 @@ class MarkdownSplitter:
     ```python
     from semantic_text_splitter import MarkdownSplitter
 
-    splitter = MarkdownSplitter()
+    splitter = MarkdownSplitter(capacity=(200,1000))
 
     # Maximum number of characters in a chunk. Will fill up the
     # chunk until it is somewhere in this range.
-    chunks = splitter.chunks("# Header\n\nyour document text", chunk_capacity=(200,1000))
+    chunks = splitter.chunks("# Header\n\nyour document text")
     ```
 
     ### Using a Hugging Face Tokenizer
@@ -291,9 +290,9 @@ class MarkdownSplitter:
     # Maximum number of tokens in a chunk
     max_tokens = 1000
     tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
-    splitter = MarkdownSplitter.from_huggingface_tokenizer(tokenizer)
+    splitter = MarkdownSplitter.from_huggingface_tokenizer(tokenizer, max_tokens)
 
-    chunks = splitter.chunks("# Header\n\nyour document text", max_tokens)
+    chunks = splitter.chunks("# Header\n\nyour document text")
     ```
 
     ### Using a Tiktoken Tokenizer
@@ -304,9 +303,9 @@ class MarkdownSplitter:
     # Maximum number of tokens in a chunk
     max_tokens = 1000
 
-    splitter = MarkdownSplitter.from_tiktoken_model("gpt-3.5-turbo")
+    splitter = MarkdownSplitter.from_tiktoken_model("gpt-3.5-turbo", max_tokens)
 
-    chunks = splitter.chunks("# Header\n\nyour document text", max_tokens)
+    chunks = splitter.chunks("# Header\n\nyour document text")
     ```
 
     ### Using a Custom Callback
@@ -315,51 +314,65 @@ class MarkdownSplitter:
     ```python
     from semantic_text_splitter import MarkdownSplitter
 
     # Optionally can also have the splitter trim whitespace for you
-    splitter = MarkdownSplitter.from_callback(lambda text: len(text))
+    splitter = MarkdownSplitter.from_callback(lambda text: len(text), 1000)
 
     # Maximum number of tokens in a chunk. Will fill up the
     # chunk until it is somewhere in this range.
-    chunks = splitter.chunks("# Header\n\nyour document text", chunk_capacity=(200,1000))
+    chunks = splitter.chunks("# Header\n\nyour document text")
     ```
 
     Args:
-        trim_chunks (bool, optional): Specify whether chunks should have whitespace trimmed from the
+        capacity (int | (int, int)): The capacity of characters in each chunk. If a
+            single int, then chunks will be filled up as much as possible, without going over
+            that number. If a tuple of two integers is provided, a chunk will be considered
+            "full" once it is within the two numbers (inclusive range). So it will only fill
+            up the chunk until the lower range is met.
+        trim (bool, optional): Specify whether chunks should have whitespace trimmed from the
             beginning and end or not. If False, joining all chunks will return the original
-            string. Indentation however will be preserved if the chunk also includes multiple lines.
-            Extra newlines are always removed, but if the text would include multiple indented list
-            items, the indentation of the first element will also be preserved.
-            Defaults to True.
+            string. Defaults to True.
     """
 
-    def __init__(self, trim_chunks: bool = True) -> None: ...
+    def __init__(
+        self, capacity: Union[int, Tuple[int, int]], trim: bool = True
+    ) -> None: ...
 
     @staticmethod
     def from_huggingface_tokenizer(
-        tokenizer, trim_chunks: bool = True
+        tokenizer, capacity: Union[int, Tuple[int, int]], trim: bool = True
    ) -> MarkdownSplitter:
         """Instantiate a new markdown splitter from a Hugging Face Tokenizer instance.
 
         Args:
             tokenizer (Tokenizer): A `tokenizers.Tokenizer` you want to use to count tokens for each
                 chunk.
-            trim_chunks (bool, optional): Specify whether chunks should have whitespace trimmed from the
-                beginning and end or not. If False, joining all chunks will return the original
-                string. Defaults to True.
+            capacity (int | (int, int)): The capacity of characters in each chunk. If a
+                single int, then chunks will be filled up as much as possible, without going over
+                that number. If a tuple of two integers is provided, a chunk will be considered
+                "full" once it is within the two numbers (inclusive range). So it will only fill
+                up the chunk until the lower range is met.
+            trim (bool, optional): Specify whether chunks should have whitespace trimmed from the
+                beginning and end or not. If False, joining all chunks will return the original
+                string. Defaults to True.
 
         Returns:
             The new markdown splitter
         """
 
     @staticmethod
     def from_huggingface_tokenizer_str(
-        json: str, trim_chunks: bool = True
+        json: str, capacity: Union[int, Tuple[int, int]], trim: bool = True
     ) -> MarkdownSplitter:
         """Instantiate a new markdown splitter from the given Hugging Face Tokenizer JSON string.
 
         Args:
             json (str): A valid JSON string representing a previously serialized
                 Hugging Face Tokenizer
-            trim_chunks (bool, optional): Specify whether chunks should have whitespace trimmed from the
+            capacity (int | (int, int)): The capacity of characters in each chunk. If a
+                single int, then chunks will be filled up as much as possible, without going over
+                that number. If a tuple of two integers is provided, a chunk will be considered
+                "full" once it is within the two numbers (inclusive range). So it will only fill
+                up the chunk until the lower range is met.
+            trim (bool, optional): Specify whether chunks should have whitespace trimmed from the
                 beginning and end or not. If False, joining all chunks will return the original
                 string. Defaults to True.
````
````diff
@@ -369,29 +382,40 @@ class MarkdownSplitter:
 
     @staticmethod
     def from_huggingface_tokenizer_file(
-        path: str, trim_chunks: bool = True
+        path: str, capacity: Union[int, Tuple[int, int]], trim: bool = True
     ) -> MarkdownSplitter:
         """Instantiate a new markdown splitter from the Hugging Face tokenizer file at the given path.
 
         Args:
             path (str): A path to a local JSON file representing a previously serialized
                 Hugging Face tokenizer.
-            trim_chunks (bool, optional): Specify whether chunks should have whitespace trimmed from the
+            capacity (int | (int, int)): The capacity of characters in each chunk. If a
+                single int, then chunks will be filled up as much as possible, without going over
+                that number. If a tuple of two integers is provided, a chunk will be considered
+                "full" once it is within the two numbers (inclusive range). So it will only fill
+                up the chunk until the lower range is met.
+            trim (bool, optional): Specify whether chunks should have whitespace trimmed from the
                 beginning and end or not. If False, joining all chunks will return the original
                 string. Defaults to True.
 
         Returns:
             The new markdown splitter
         """
 
     @staticmethod
-    def from_tiktoken_model(model: str, trim_chunks: bool = True) -> MarkdownSplitter:
+    def from_tiktoken_model(
+        model: str, capacity: Union[int, Tuple[int, int]], trim: bool = True
+    ) -> MarkdownSplitter:
         """Instantiate a new markdown splitter based on an OpenAI Tiktoken tokenizer.
 
         Args:
             model (str): The OpenAI model name you want to retrieve a tokenizer for.
-            trim_chunks (bool, optional): Specify whether chunks should have whitespace trimmed from the
+            capacity (int | (int, int)): The capacity of characters in each chunk. If a
+                single int, then chunks will be filled up as much as possible, without going over
+                that number. If a tuple of two integers is provided, a chunk will be considered
+                "full" once it is within the two numbers (inclusive range). So it will only fill
+                up the chunk until the lower range is met.
+            trim (bool, optional): Specify whether chunks should have whitespace trimmed from the
                 beginning and end or not. If False, joining all chunks will return the original
                 string. Defaults to True.
````
````diff
@@ -401,24 +425,29 @@ class MarkdownSplitter:
 
     @staticmethod
     def from_callback(
-        callback: Callable[[str], int], trim_chunks: bool = True
+        callback: Callable[[str], int],
+        capacity: Union[int, Tuple[int, int]],
+        trim: bool = True,
     ) -> MarkdownSplitter:
         """Instantiate a new markdown splitter based on a custom callback.
 
         Args:
             callback (Callable[[str], int]): A lambda or other function that can be called. It will be
                 provided a piece of text, and it should return an integer value for the size.
-            trim_chunks (bool, optional): Specify whether chunks should have whitespace trimmed from the
+            capacity (int | (int, int)): The capacity of characters in each chunk. If a
+                single int, then chunks will be filled up as much as possible, without going over
+                that number. If a tuple of two integers is provided, a chunk will be considered
+                "full" once it is within the two numbers (inclusive range). So it will only fill
+                up the chunk until the lower range is met.
+            trim (bool, optional): Specify whether chunks should have whitespace trimmed from the
                 beginning and end or not. If False, joining all chunks will return the original
                 string. Defaults to True.
 
         Returns:
             The new markdown splitter
         """
 
-    def chunks(
-        self, text: str, chunk_capacity: Union[int, Tuple[int, int]]
-    ) -> List[str]:
+    def chunks(self, text: str) -> List[str]:
         """Generate a list of chunks from a given text. Each chunk will be up to the `chunk_capacity`.
 
         ## Method
@@ -445,31 +474,19 @@ class MarkdownSplitter:
 
         Args:
             text (str): Text to split.
-            chunk_capacity (int | (int, int)): The capacity of characters in each chunk. If a
-                single int, then chunks will be filled up as much as possible, without going over
-                that number. If a tuple of two integers is provided, a chunk will be considered
-                "full" once it is within the two numbers (inclusive range). So it will only fill
-                up the chunk until the lower range is met.
 
         Returns:
             A list of strings, one for each chunk. If `trim_chunks` was specified in the text
             splitter, then each chunk will already be trimmed as well.
         """
 
-    def chunk_indices(
-        self, text: str, chunk_capacity: Union[int, Tuple[int, int]]
-    ) -> List[Tuple[int, str]]:
+    def chunk_indices(self, text: str) -> List[Tuple[int, str]]:
         """Generate a list of chunks from a given text, along with their character offsets in the original text. Each chunk will be up to the `chunk_capacity`.
 
         See `chunks` for more information.
 
         Args:
             text (str): Text to split.
-            chunk_capacity (int | (int, int)): The capacity of characters in each chunk. If a
-                single int, then chunks will be filled up as much as possible, without going over
-                that number. If a tuple of two integers is provided, a chunk will be considered
-                "full" once it is within the two numbers (inclusive range). So it will only fill
-                up the chunk until the lower range is met.
 
         Returns:
             A list of tuples, one for each chunk. The first item will be the character offset relative
````