
[fix] Make multiprocessing optional for inherited chunkers
bhavnicksm committed Jan 6, 2025
1 parent e0c67f9 commit ba7caae
Showing 5 changed files with 91 additions and 16 deletions.
47 changes: 41 additions & 6 deletions src/chonkie/chunker/base.py
@@ -34,6 +34,9 @@ def __init__(
self.tokenizer = tokenizer_or_token_counter
self._tokenizer_backend = self._get_tokenizer_backend()
self.token_counter = self._get_tokenizer_counter()

# Set whether to use multiprocessing or not
self._use_multiprocessing = True

def _get_tokenizer_backend(self):
"""Return the backend tokenizer object."""
@@ -235,33 +238,65 @@ def _determine_optimal_workers(self) -> int:
f"Error determining optimal workers: {e}. Using single process."
)
return 1


def _process_batch_sequential(self,
texts: List[str],
show_progress_bar: bool = True) -> List[List[Chunk]]:
"""Process a batch of texts sequentially."""
return [
self.chunk(t) for t in tqdm(
texts,
desc="πŸ¦› CHONKING",
disable=not show_progress_bar,
unit="texts",
bar_format="{desc}: |{bar:20}| {percentage:3.0f}% β€’ {n_fmt}/{total_fmt} texts chunked [{elapsed}<{remaining}, {rate_fmt}] 🌱",
ascii=' β–β–Žβ–β–Œβ–‹β–Šβ–‰'
)
]

def _process_batch_multiprocessing(self,
texts: List[str],
show_progress_bar: bool = True) -> List[List[Chunk]]:
"""Process a batch of texts using multiprocessing."""
num_workers = self._determine_optimal_workers()
with Pool(processes=num_workers) as pool:
return list(tqdm(pool.imap(self.chunk, texts),
desc="πŸ¦› CHONKING",
disable=not show_progress_bar,
unit="texts",
bar_format="{desc}: |{bar:20}| {percentage:3.0f}% β€’ {n_fmt}/{total_fmt} texts chunked [{elapsed}<{remaining}, {rate_fmt}] 🌱",
ascii=' β–β–Žβ–β–Œβ–‹β–Šβ–‰'))

def chunk_batch(
self,
text: List[str],
texts: List[str],
show_progress_bar: bool = True,
) -> List[List[Chunk]]:
"""Split a List of texts into their respective chunks.
By default, this method uses multiprocessing to parallelize the chunking process.
Args:
text: List of input texts to be chunked.
texts: List of input texts to be chunked.
show_progress_bar: Whether to show a progress bar.
Returns:
List of lists of Chunk objects containing the chunked text and metadata
"""
return [self.chunk(t) for t in tqdm(text, desc="Chunking Texts", disable=not show_progress_bar)]
if self._use_multiprocessing:
return self._process_batch_multiprocessing(texts, show_progress_bar)
else:
return self._process_batch_sequential(texts, show_progress_bar)

def __call__(
self, text: Union[str, List[str]]
self, text: Union[str, List[str]], show_progress_bar: bool = True
) -> Union[List[Chunk], List[List[Chunk]]]:
"""Make the chunker callable directly.
Args:
text: Input text or list of texts to be chunked
show_progress_bar: Whether to show a progress bar (for batch chunking)
Returns:
List of Chunk objects or list of lists of Chunk
@@ -270,7 +305,7 @@ def __call__(
if isinstance(text, str):
return self.chunk(text)
elif isinstance(text, list):
return self.chunk_batch(text)
return self.chunk_batch(text, show_progress_bar)
else:
raise ValueError("Input must be a string or a list of strings.")

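For context, the core of this change is a dispatch in BaseChunker.chunk_batch between a sequential path and a multiprocessing path, controlled by the _use_multiprocessing flag set in __init__. Below is a minimal, hedged sketch of that pattern — ToyChunker and its whitespace "chunking" are invented for illustration and are not the chonkie API; only the flag and the sequential-vs-Pool dispatch mirror the diff above.

    # Minimal sketch of the dispatch pattern; ToyChunker is hypothetical.
    from multiprocessing import Pool
    from typing import List


    class ToyChunker:
        def __init__(self) -> None:
            # Subclasses holding state that does not travel well across
            # processes can flip this to False to force sequential batching.
            self._use_multiprocessing = True

        def chunk(self, text: str) -> List[str]:
            # Stand-in for the real chunking logic.
            return text.split()

        def _process_batch_sequential(self, texts: List[str]) -> List[List[str]]:
            return [self.chunk(t) for t in texts]

        def _process_batch_multiprocessing(self, texts: List[str]) -> List[List[str]]:
            # Pool.imap sends each text (and the bound chunk method) to worker processes.
            with Pool(processes=2) as pool:
                return list(pool.imap(self.chunk, texts))

        def chunk_batch(self, texts: List[str]) -> List[List[str]]:
            if self._use_multiprocessing:
                return self._process_batch_multiprocessing(texts)
            return self._process_batch_sequential(texts)


    if __name__ == "__main__":
        print(ToyChunker().chunk_batch(["hello world", "chonk chonk"]))

The real methods additionally wrap both paths in tqdm progress bars and pick a worker count via _determine_optimal_workers; the sketch drops that to stay self-contained.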
3 changes: 3 additions & 0 deletions src/chonkie/chunker/late.py
@@ -102,6 +102,9 @@ def __init__(self,
# for the semantic meaning to be calculated properly
super().__init__(self.embedding_model.get_tokenizer_or_token_counter())

# Disable the multiprocessing flag inherited from the base class
self._use_multiprocessing = False

def _create_token_chunks(self,
chunk_texts: List[str],
token_counts: List[int],
3 changes: 3 additions & 0 deletions src/chonkie/chunker/sdpm.py
@@ -76,6 +76,9 @@ def __init__(
)
self.skip_window = skip_window

# Disable the multiprocessing flag inherited from the base class
self._use_multiprocessing = False

def _merge_groups(self, groups: List[List[Sentence]]) -> List[Sentence]:
"""Merge the groups together."""
merged_group = []
3 changes: 3 additions & 0 deletions src/chonkie/chunker/semantic.py
@@ -129,6 +129,9 @@ def __init__(
# for the group semantic meaning to be calculated properly
super().__init__(self.embedding_model.get_tokenizer_or_token_counter())

# Disable the multiprocessing flag inherited from the base class
self._use_multiprocessing = False

def _split_sentences(
self,
text: str,
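The three files above (late.py, sdpm.py, semantic.py) apply the identical opt-out: immediately after super().__init__, the subclass turns the inherited flag off. A hedged sketch of that pattern, reusing the hypothetical ToyChunker from the earlier sketch rather than the real LateChunker, SDPMChunker, or SemanticChunker:

    # Sketch of the per-subclass opt-out; EmbeddingChunker is hypothetical.
    class EmbeddingChunker(ToyChunker):
        def __init__(self) -> None:
            super().__init__()
            # Turn off the multiprocessing path inherited from the base class,
            # mirroring the one-line change made in the three diffs above.
            self._use_multiprocessing = False

With this in place, chunk_batch on such a chunker takes the sequential path with no further changes to the subclass; a plausible motivation (not stated in the commit) is that chunkers holding an embedding model on self do not fan out cleanly across worker processes.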
51 changes: 41 additions & 10 deletions src/chonkie/chunker/token.py
@@ -6,7 +6,7 @@

from .base import BaseChunker


from tqdm import trange
class TokenChunker(BaseChunker):
"""Chunker that splits text into chunks of a specified token size.
@@ -48,6 +48,8 @@ def __init__(
if isinstance(chunk_overlap, int)
else int(chunk_overlap * chunk_size)
)

self._use_multiprocessing = False

def _create_chunks(
self,
@@ -169,27 +171,56 @@ def _process_text_batch(self, texts: List[str]) -> List[List[Chunk]]:
return result

def chunk_batch(
self, texts: List[str], batch_size: Union[int, None] = None
self,
texts: List[str],
batch_size: int = 1,
show_progress_bar: bool = True
) -> List[List[Chunk]]:
"""Split a batch of texts into their respective chunks.
Args:
texts: List of input texts to be chunked
batch_size: Number of texts to process in a single batch
show_progress_bar: Whether to show a progress bar
Returns:
List of lists of Chunk objects containing the chunked text and metadata
"""
# if batch_size is not None, we process the texts in mini-batches to avoid memory issues
if batch_size is not None:
chunks = []
for i in range(0, len(texts), batch_size):
batch_texts = texts[i : min(i + batch_size, len(texts))]
chunks.extend(self._process_text_batch(batch_texts))
return chunks
chunks = []
for i in trange(0,
len(texts),
batch_size,
desc="πŸ¦› CHONKING",
disable=not show_progress_bar,
unit="batches",
ascii=" β–β–Žβ–β–Œβ–‹β–Šβ–‰",
bar_format="{desc}: |{bar:20}| {percentage:3.0f}% β€’ {n_fmt}/{total_fmt} batches chunked [{elapsed}<{remaining}, {rate_fmt}] 🌱"):
batch_texts = texts[i : min(i + batch_size, len(texts))]
chunks.extend(self._process_text_batch(batch_texts))
return chunks

def __call__(self,
text: Union[str, List[str]],
batch_size: int = 1,
show_progress_bar: bool = True) -> Union[List[Chunk], List[List[Chunk]]]:
"""Make the TokenChunker callable directly.
Args:
text: Input text or list of texts to be chunked
batch_size: Number of texts to process in a single batch
show_progress_bar: Whether to show a progress bar (for batch chunking)
Returns:
List of Chunk objects or list of lists of Chunk
"""
if isinstance(text, str):
return self.chunk(text)
elif isinstance(text, list) and isinstance(text[0], str):
return self.chunk_batch(text, batch_size, show_progress_bar)
else:
return self._process_text_batch(texts)
raise ValueError("Invalid input type. Expected a string or a list of strings.")

def __repr__(self) -> str:
"""Return a string representation of the TokenChunker."""

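Finally, a hedged usage sketch of the updated call path: batch_size and show_progress_bar come from the TokenChunker diff above, while the constructor arguments ("gpt2", chunk_size, chunk_overlap) are assumptions about TokenChunker.__init__ rather than something shown in this commit.

    # Usage sketch; the constructor arguments are assumed, not taken from this diff.
    from chonkie import TokenChunker

    chunker = TokenChunker("gpt2", chunk_size=512, chunk_overlap=0.1)

    # Single text -> List[Chunk]
    single = chunker("one short document")

    # List of texts -> List[List[Chunk]], processed in mini-batches of 8;
    # pass show_progress_bar=False to silence the progress bar.
    batched = chunker(
        ["first document", "second document", "third document"],
        batch_size=8,
        show_progress_bar=True,
    )

Since TokenChunker sets _use_multiprocessing = False in this commit, its batch path stays sequential (mini-batched with trange) rather than going through the Pool-based path added to the base class.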