From 49d4afe3db645bd7ae99519bcefd24120cc99ab9 Mon Sep 17 00:00:00 2001 From: bhavnicksm Date: Tue, 7 Jan 2025 03:31:58 +0530 Subject: [PATCH] [fix] Progress bar style to ' >=' --- src/chonkie/chunker/base.py | 29 ++++++++++++++++++----------- src/chonkie/chunker/token.py | 4 ++-- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/src/chonkie/chunker/base.py b/src/chonkie/chunker/base.py index 322adcd..35ab3ab 100644 --- a/src/chonkie/chunker/base.py +++ b/src/chonkie/chunker/base.py @@ -249,24 +249,31 @@ def _process_batch_sequential(self, desc="🦛 CHONKING", disable=not show_progress_bar, unit="texts", - bar_format="{desc}: |{bar:20}| {percentage:3.0f}% • {n_fmt}/{total_fmt} texts chunked [{elapsed}<{remaining}, {rate_fmt}] 🌱", - ascii=' ▏▎▍▌▋▊▉' - ) - ] + bar_format="{desc}: [{bar:20}] {percentage:3.0f}% • {n_fmt}/{total_fmt} texts chunked [{elapsed}<{remaining}, {rate_fmt}] 🌱", + ascii=' >=') + ] def _process_batch_multiprocessing(self, texts: List[str], show_progress_bar: bool = True) -> List[List[Chunk]]: """Process a batch of texts using multiprocessing.""" num_workers = self._determine_optimal_workers() + total = len(texts) + chunksize = max(1, min(total // (num_workers * 16), 10)) # Optimize chunk size + with Pool(processes=num_workers) as pool: - return list(tqdm(pool.imap(self.chunk, texts), - desc="🦛 CHONKING", - disable=not show_progress_bar, - unit="texts", - bar_format="{desc}: |{bar:20}| {percentage:3.0f}% • {n_fmt}/{total_fmt} texts chunked [{elapsed}<{remaining}, {rate_fmt}] 🌱", - ascii=' ▏▎▍▌▋▊▉')) - + results = [] + with tqdm(total=total, + desc="🦛 CHONKING", + disable=not show_progress_bar, + unit="texts", + bar_format="{desc}: [{bar:20}] {percentage:3.0f}% • {n_fmt}/{total_fmt} texts chunked [{elapsed}<{remaining}, {rate_fmt}] 🌱", + ascii=' >=') as pbar: + for result in pool.imap_unordered(self.chunk, texts, chunksize=chunksize): + results.append(result) + pbar.update() + return results + def chunk_batch( self, texts: List[str], diff --git a/src/chonkie/chunker/token.py b/src/chonkie/chunker/token.py index 98391d6..3816402 100644 --- a/src/chonkie/chunker/token.py +++ b/src/chonkie/chunker/token.py @@ -194,8 +194,8 @@ def chunk_batch( desc="🦛 CHONKING", disable=not show_progress_bar, unit="batches", - ascii=" ▏▎▍▌▋▊▉", - bar_format="{desc}: |{bar:20}| {percentage:3.0f}% • {n_fmt}/{total_fmt} batches chunked [{elapsed}<{remaining}, {rate_fmt}] 🌱"): + bar_format="{desc}: [{bar:20}] {percentage:3.0f}% • {n_fmt}/{total_fmt} batches chunked [{elapsed}<{remaining}, {rate_fmt}] 🌱", + ascii=' >='): batch_texts = texts[i : min(i + batch_size, len(texts))] chunks.extend(self._process_text_batch(batch_texts)) return chunks