Commit 981043d: Use chunk config in Markdown as well

benbrandt committed Apr 20, 2024 (1 parent: 99cb125)
Showing 11 changed files with 385 additions and 663 deletions.

README.md (6 changes: 2 additions & 4 deletions)
````diff
@@ -113,11 +113,9 @@ use text_splitter::MarkdownSplitter;
 let max_characters = 1000;
 // Default implementation uses character count for chunk size.
 // Can also use all of the same tokenizer implementations as `TextSplitter`.
-let splitter = MarkdownSplitter::default()
-    // Optionally can also have the splitter trim whitespace for you
-    .with_trim_chunks(true);
+let splitter = MarkdownSplitter::new(max_characters);
 
-let chunks = splitter.chunks("# Header\n\nyour document text", max_characters);
+let chunks = splitter.chunks("# Header\n\nyour document text");
 ```
 
 ## Method
````
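
With this change the chunk capacity moves into the `MarkdownSplitter` constructor, and a tokenizer-based sizer rides along in the same `ChunkConfig`, mirroring `TextSplitter`. A minimal sketch of the configured form, assuming the `tiktoken-rs` feature is enabled (the `ChunkConfig::new(...).with_sizer(...)` pattern is taken from the benchmark changes below):

```rust
use text_splitter::{ChunkConfig, MarkdownSplitter};

// Capacity and sizer now travel together inside the splitter itself,
// so `chunks()` no longer takes a size argument.
let config = ChunkConfig::new(1000).with_sizer(tiktoken_rs::cl100k_base().unwrap());
let splitter = MarkdownSplitter::new(config);
let chunks = splitter.chunks("# Header\n\nyour document text");
```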

benches/chunk_size.rs (28 changes: 15 additions & 13 deletions)
````diff
@@ -43,23 +43,23 @@ mod text {
 
     #[divan::bench(args = TEXT_FILENAMES, consts = CHUNK_SIZES)]
     fn characters<const N: usize>(bencher: Bencher<'_, '_>, filename: &str) {
-        bench::<_, _>(bencher, filename, || TextSplitter::new(N));
+        bench(bencher, filename, || TextSplitter::new(N));
     }
 
     #[cfg(feature = "tiktoken-rs")]
     #[divan::bench(args = TEXT_FILENAMES, consts = CHUNK_SIZES)]
     fn tiktoken<const N: usize>(bencher: Bencher<'_, '_>, filename: &str) {
         use text_splitter::ChunkConfig;
 
-        bench::<_, _>(bencher, filename, || {
+        bench(bencher, filename, || {
             TextSplitter::new(ChunkConfig::new(N).with_sizer(tiktoken_rs::cl100k_base().unwrap()))
         });
     }
 
     #[cfg(feature = "tokenizers")]
     #[divan::bench(args = TEXT_FILENAMES, consts = CHUNK_SIZES)]
     fn tokenizers<const N: usize>(bencher: Bencher<'_, '_>, filename: &str) {
-        bench::<_, _>(bencher, filename, || {
+        bench(bencher, filename, || {
             TextSplitter::new(ChunkConfig::new(N).with_sizer(
                 tokenizers::Tokenizer::from_pretrained("bert-base-cased", None).unwrap(),
             ))
@@ -73,15 +73,15 @@ mod markdown {
     use std::fs;
 
     use divan::{black_box_drop, counter::BytesCount, Bencher};
-    use text_splitter::{ChunkSizer, MarkdownSplitter};
+    use text_splitter::{ChunkConfig, ChunkSizer, MarkdownSplitter};
 
     use crate::CHUNK_SIZES;
 
     const MARKDOWN_FILENAMES: &[&str] = &["commonmark_spec"];
 
-    fn bench<const N: usize, S, G>(bencher: Bencher<'_, '_>, filename: &str, gen_splitter: G)
+    fn bench<S, G>(bencher: Bencher<'_, '_>, filename: &str, gen_splitter: G)
     where
-        G: Fn() -> MarkdownSplitter<S> + Sync,
+        G: Fn() -> MarkdownSplitter<usize, S> + Sync,
         S: ChunkSizer,
     {
         bencher
@@ -93,30 +93,32 @@ mod markdown {
             })
             .input_counter(|(_, text)| BytesCount::of_str(text))
             .bench_values(|(splitter, text)| {
-                splitter.chunks(&text, N).for_each(black_box_drop);
+                splitter.chunks(&text).for_each(black_box_drop);
             });
     }
 
     #[divan::bench(args = MARKDOWN_FILENAMES, consts = CHUNK_SIZES)]
     fn characters<const N: usize>(bencher: Bencher<'_, '_>, filename: &str) {
-        bench::<N, _, _>(bencher, filename, MarkdownSplitter::default);
+        bench(bencher, filename, || MarkdownSplitter::new(N));
     }
 
     #[cfg(feature = "tiktoken-rs")]
     #[divan::bench(args = MARKDOWN_FILENAMES, consts = CHUNK_SIZES)]
     fn tiktoken<const N: usize>(bencher: Bencher<'_, '_>, filename: &str) {
-        bench::<N, _, _>(bencher, filename, || {
-            MarkdownSplitter::new(tiktoken_rs::cl100k_base().unwrap())
+        bench(bencher, filename, || {
+            MarkdownSplitter::new(
+                ChunkConfig::new(N).with_sizer(tiktoken_rs::cl100k_base().unwrap()),
+            )
         });
     }
 
     #[cfg(feature = "tokenizers")]
     #[divan::bench(args = MARKDOWN_FILENAMES, consts = CHUNK_SIZES)]
     fn tokenizers<const N: usize>(bencher: Bencher<'_, '_>, filename: &str) {
-        bench::<N, _, _>(bencher, filename, || {
-            MarkdownSplitter::new(
+        bench(bencher, filename, || {
+            MarkdownSplitter::new(ChunkConfig::new(N).with_sizer(
                 tokenizers::Tokenizer::from_pretrained("bert-base-cased", None).unwrap(),
-            )
+            ))
         });
     }
 }
````
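
The Python stub changes below document the tuple form of `capacity`, where a chunk counts as "full" anywhere inside an inclusive range. On the Rust side the same idea is expressed with a range capacity; a sketch under the assumption that ranges satisfy the chunk-capacity trait for `MarkdownSplitter` the same way they do for `TextSplitter`:

```rust
use text_splitter::MarkdownSplitter;

// Assumed range capacity: filling stops once a chunk reaches 200
// characters, and a chunk never exceeds 1000.
let splitter = MarkdownSplitter::new(200..1000);
let chunks = splitter.chunks("# Header\n\nyour document text");
```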

bindings/python/semantic_text_splitter.pyi (117 changes: 67 additions & 50 deletions)
````diff
@@ -127,7 +127,7 @@ class TextSplitter:
                 up the chunk until the lower range is met.
             trim (bool, optional): Specify whether chunks should have whitespace trimmed from the
                 beginning and end or not. If False, joining all chunks will return the original
-                string. Defaults to True..
+                string. Defaults to True.
 
         Returns:
             The new text splitter
@@ -149,8 +149,7 @@ class TextSplitter:
                 up the chunk until the lower range is met.
             trim (bool, optional): Specify whether chunks should have whitespace trimmed from the
                 beginning and end or not. If False, joining all chunks will return the original
-                string. Defaults to True..
-
+                string. Defaults to True.
 
         Returns:
             The new text splitter
@@ -171,7 +170,7 @@ class TextSplitter:
                 up the chunk until the lower range is met.
             trim (bool, optional): Specify whether chunks should have whitespace trimmed from the
                 beginning and end or not. If False, joining all chunks will return the original
-                string. Defaults to True..
+                string. Defaults to True.
 
         Returns:
             The new text splitter
````
````diff
@@ -258,10 +257,10 @@ class MarkdownSplitter:
     # Maximum number of characters in a chunk
     max_characters = 1000
     # Optionally can also have the splitter not trim whitespace for you
-    splitter = MarkdownSplitter()
-    # splitter = MarkdownSplitter(trim_chunks=False)
+    splitter = MarkdownSplitter(max_characters)
+    # splitter = MarkdownSplitter(max_characters, trim=False)
 
-    chunks = splitter.chunks("# Header\n\nyour document text", max_characters)
+    chunks = splitter.chunks("# Header\n\nyour document text")
     ```
 
     ### Using a Range for Chunk Capacity
@@ -275,11 +274,11 @@ class MarkdownSplitter:
     ```python
     from semantic_text_splitter import MarkdownSplitter
 
-    splitter = MarkdownSplitter()
+    splitter = MarkdownSplitter(capacity=(200,1000))
 
     # Maximum number of characters in a chunk. Will fill up the
     # chunk until it is somewhere in this range.
-    chunks = splitter.chunks("# Header\n\nyour document text", chunk_capacity=(200,1000))
+    chunks = splitter.chunks("# Header\n\nyour document text")
     ```
 
     ### Using a Hugging Face Tokenizer
@@ -291,9 +290,9 @@ class MarkdownSplitter:
     # Maximum number of tokens in a chunk
     max_tokens = 1000
     tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
-    splitter = MarkdownSplitter.from_huggingface_tokenizer(tokenizer)
+    splitter = MarkdownSplitter.from_huggingface_tokenizer(tokenizer, max_tokens)
 
-    chunks = splitter.chunks("# Header\n\nyour document text", max_tokens)
+    chunks = splitter.chunks("# Header\n\nyour document text")
     ```
 
     ### Using a Tiktoken Tokenizer
@@ -304,9 +303,9 @@ class MarkdownSplitter:
     # Maximum number of tokens in a chunk
     max_tokens = 1000
 
-    splitter = MarkdownSplitter.from_tiktoken_model("gpt-3.5-turbo")
+    splitter = MarkdownSplitter.from_tiktoken_model("gpt-3.5-turbo", max_tokens)
 
-    chunks = splitter.chunks("# Header\n\nyour document text", max_tokens)
+    chunks = splitter.chunks("# Header\n\nyour document text")
     ```
 
     ### Using a Custom Callback
@@ -315,51 +314,65 @@ class MarkdownSplitter:
     ```python
     from semantic_text_splitter import MarkdownSplitter
 
     # Optionally can also have the splitter trim whitespace for you
-    splitter = MarkdownSplitter.from_callback(lambda text: len(text))
+    splitter = MarkdownSplitter.from_callback(lambda text: len(text), 1000)
 
     # Maximum number of tokens in a chunk. Will fill up the
     # chunk until it is somewhere in this range.
-    chunks = splitter.chunks("# Header\n\nyour document text", chunk_capacity=(200,1000))
+    chunks = splitter.chunks("# Header\n\nyour document text")
     ```
 
     Args:
-        trim_chunks (bool, optional): Specify whether chunks should have whitespace trimmed from the
+        capacity (int | (int, int)): The capacity of characters in each chunk. If a
+            single int, then chunks will be filled up as much as possible, without going over
+            that number. If a tuple of two integers is provided, a chunk will be considered
+            "full" once it is within the two numbers (inclusive range). So it will only fill
+            up the chunk until the lower range is met.
+        trim (bool, optional): Specify whether chunks should have whitespace trimmed from the
             beginning and end or not. If False, joining all chunks will return the original
-            string. Indentation however will be preserved if the chunk also includes multiple lines.
-            Extra newlines are always removed, but if the text would include multiple indented list
-            items, the indentation of the first element will also be preserved.
-            Defaults to True.
+            string. Defaults to True.
     """
 
-    def __init__(self, trim_chunks: bool = True) -> None: ...
+    def __init__(
+        self, capacity: Union[int, Tuple[int, int]], trim: bool = True
+    ) -> None: ...
 
     @staticmethod
     def from_huggingface_tokenizer(
-        tokenizer, trim_chunks: bool = True
+        tokenizer, capacity: Union[int, Tuple[int, int]], trim: bool = True
    ) -> MarkdownSplitter:
         """Instantiate a new markdown splitter from a Hugging Face Tokenizer instance.
 
         Args:
             tokenizer (Tokenizer): A `tokenizers.Tokenizer` you want to use to count tokens for each
                 chunk.
-            trim_chunks (bool, optional): Specify whether chunks should have whitespace trimmed from the
-                beginning and end or not. If False, joining all chunks will return the original
-                string. Defaults to True.
+            capacity (int | (int, int)): The capacity of characters in each chunk. If a
+                single int, then chunks will be filled up as much as possible, without going over
+                that number. If a tuple of two integers is provided, a chunk will be considered
+                "full" once it is within the two numbers (inclusive range). So it will only fill
+                up the chunk until the lower range is met.
+            trim (bool, optional): Specify whether chunks should have whitespace trimmed from the
+                beginning and end or not. If False, joining all chunks will return the original
+                string. Defaults to True.
 
         Returns:
             The new markdown splitter
         """
 
     @staticmethod
     def from_huggingface_tokenizer_str(
-        json: str, trim_chunks: bool = True
+        json: str, capacity: Union[int, Tuple[int, int]], trim: bool = True
     ) -> MarkdownSplitter:
         """Instantiate a new markdown splitter from the given Hugging Face Tokenizer JSON string.
 
         Args:
             json (str): A valid JSON string representing a previously serialized
                 Hugging Face Tokenizer
-            trim_chunks (bool, optional): Specify whether chunks should have whitespace trimmed from the
+            capacity (int | (int, int)): The capacity of characters in each chunk. If a
+                single int, then chunks will be filled up as much as possible, without going over
+                that number. If a tuple of two integers is provided, a chunk will be considered
+                "full" once it is within the two numbers (inclusive range). So it will only fill
+                up the chunk until the lower range is met.
+            trim (bool, optional): Specify whether chunks should have whitespace trimmed from the
                 beginning and end or not. If False, joining all chunks will return the original
                 string. Defaults to True.
````
````diff
@@ -369,29 +382,40 @@ class MarkdownSplitter:
 
     @staticmethod
     def from_huggingface_tokenizer_file(
-        path: str, trim_chunks: bool = True
+        path: str, capacity: Union[int, Tuple[int, int]], trim: bool = True
     ) -> MarkdownSplitter:
         """Instantiate a new markdown splitter from the Hugging Face tokenizer file at the given path.
 
         Args:
             path (str): A path to a local JSON file representing a previously serialized
                 Hugging Face tokenizer.
-            trim_chunks (bool, optional): Specify whether chunks should have whitespace trimmed from the
+            capacity (int | (int, int)): The capacity of characters in each chunk. If a
+                single int, then chunks will be filled up as much as possible, without going over
+                that number. If a tuple of two integers is provided, a chunk will be considered
+                "full" once it is within the two numbers (inclusive range). So it will only fill
+                up the chunk until the lower range is met.
+            trim (bool, optional): Specify whether chunks should have whitespace trimmed from the
                 beginning and end or not. If False, joining all chunks will return the original
                 string. Defaults to True.
 
         Returns:
             The new markdown splitter
         """
 
     @staticmethod
-    def from_tiktoken_model(model: str, trim_chunks: bool = True) -> MarkdownSplitter:
+    def from_tiktoken_model(
+        model: str, capacity: Union[int, Tuple[int, int]], trim: bool = True
+    ) -> MarkdownSplitter:
         """Instantiate a new markdown splitter based on an OpenAI Tiktoken tokenizer.
 
         Args:
             model (str): The OpenAI model name you want to retrieve a tokenizer for.
-            trim_chunks (bool, optional): Specify whether chunks should have whitespace trimmed from the
+            capacity (int | (int, int)): The capacity of characters in each chunk. If a
+                single int, then chunks will be filled up as much as possible, without going over
+                that number. If a tuple of two integers is provided, a chunk will be considered
+                "full" once it is within the two numbers (inclusive range). So it will only fill
+                up the chunk until the lower range is met.
+            trim (bool, optional): Specify whether chunks should have whitespace trimmed from the
                 beginning and end or not. If False, joining all chunks will return the original
                 string. Defaults to True.
````
````diff
@@ -401,24 +425,29 @@ class MarkdownSplitter:
 
     @staticmethod
     def from_callback(
-        callback: Callable[[str], int], trim_chunks: bool = True
+        callback: Callable[[str], int],
+        capacity: Union[int, Tuple[int, int]],
+        trim: bool = True,
     ) -> MarkdownSplitter:
         """Instantiate a new markdown splitter based on a custom callback.
 
         Args:
             callback (Callable[[str], int]): A lambda or other function that can be called. It will be
                 provided a piece of text, and it should return an integer value for the size.
-            trim_chunks (bool, optional): Specify whether chunks should have whitespace trimmed from the
+            capacity (int | (int, int)): The capacity of characters in each chunk. If a
+                single int, then chunks will be filled up as much as possible, without going over
+                that number. If a tuple of two integers is provided, a chunk will be considered
+                "full" once it is within the two numbers (inclusive range). So it will only fill
+                up the chunk until the lower range is met.
+            trim (bool, optional): Specify whether chunks should have whitespace trimmed from the
                 beginning and end or not. If False, joining all chunks will return the original
                 string. Defaults to True.
 
         Returns:
             The new markdown splitter
         """
 
-    def chunks(
-        self, text: str, chunk_capacity: Union[int, Tuple[int, int]]
-    ) -> List[str]:
+    def chunks(self, text: str) -> List[str]:
         """Generate a list of chunks from a given text. Each chunk will be up to the `chunk_capacity`.
 
         ## Method
@@ -445,31 +474,19 @@ class MarkdownSplitter:
 
         Args:
             text (str): Text to split.
-            chunk_capacity (int | (int, int)): The capacity of characters in each chunk. If a
-                single int, then chunks will be filled up as much as possible, without going over
-                that number. If a tuple of two integers is provided, a chunk will be considered
-                "full" once it is within the two numbers (inclusive range). So it will only fill
-                up the chunk until the lower range is met.
 
         Returns:
             A list of strings, one for each chunk. If `trim_chunks` was specified in the text
             splitter, then each chunk will already be trimmed as well.
         """
 
-    def chunk_indices(
-        self, text: str, chunk_capacity: Union[int, Tuple[int, int]]
-    ) -> List[Tuple[int, str]]:
+    def chunk_indices(self, text: str) -> List[Tuple[int, str]]:
         """Generate a list of chunks from a given text, along with their character offsets in the original text. Each chunk will be up to the `chunk_capacity`.
 
         See `chunks` for more information.
 
         Args:
             text (str): Text to split.
-            chunk_capacity (int | (int, int)): The capacity of characters in each chunk. If a
-                single int, then chunks will be filled up as much as possible, without going over
-                that number. If a tuple of two integers is provided, a chunk will be considered
-                "full" once it is within the two numbers (inclusive range). So it will only fill
-                up the chunk until the lower range is met.
 
         Returns:
             A list of tuples, one for each chunk. The first item will be the character offset relative
````