Add streaming parameter #197

Merged · 51 commits · Sep 24, 2024

Changes from all commits
9a3d8da
test: new modules
KevKibe Sep 18, 2024
0baaad2
test: new modules
KevKibe Sep 18, 2024
e0b7a49
test: revert
KevKibe Sep 18, 2024
e1e8961
fix(training): repository creation
KevKibe Sep 18, 2024
33f6e48
update: add pseudo labelling eval
KevKibe Sep 18, 2024
927a9d2
fix: missing import
KevKibe Sep 18, 2024
4fce347
fix: missing import
KevKibe Sep 18, 2024
bf6547c
fix: generation_num_beams parameter
KevKibe Sep 18, 2024
9d96d96
fix: training scheduler
KevKibe Sep 20, 2024
5511946
fix: training scheduler
KevKibe Sep 20, 2024
55c4ac1
fix: training scheduler
KevKibe Sep 20, 2024
4aed3ce
fix: add print statement for len pred_str eval_preds
KevKibe Sep 20, 2024
7b4918d
fix: add print statement for len raw datasets split
KevKibe Sep 20, 2024
7b9b65d
update(data): add streaming parameter
KevKibe Sep 22, 2024
02b827f
update(docs): troubleshoot page
KevKibe Sep 23, 2024
2b568c5
update: test dataset streaming
KevKibe Sep 23, 2024
077a515
fix: ruff formatting
KevKibe Sep 23, 2024
7b54569
fix: ruff formatting
KevKibe Sep 23, 2024
2a39def
fix: refactore HF_WRITE_TOKEN to HF_TOKEN
KevKibe Sep 23, 2024
4ccc70c
fix: add verbosity to pytest commands
KevKibe Sep 23, 2024
bd3cb7d
fix: add num_speakers parameter
KevKibe Sep 23, 2024
e5a11bb
fix: logging
KevKibe Sep 23, 2024
b8c345c
fix: ruff formatting
KevKibe Sep 23, 2024
a95557f
fix: revert to print statements
KevKibe Sep 23, 2024
db82315
fix: filter_eot_tokens
KevKibe Sep 23, 2024
ca0e55f
fix: compute_metrics
KevKibe Sep 23, 2024
e5b1a71
fix: pseudo labelling
KevKibe Sep 23, 2024
404dec8
fix: filter_eot_tokens, compute metrics parameters
KevKibe Sep 24, 2024
04ab848
fix: remove unused imports
KevKibe Sep 24, 2024
62b09d2
update: add lr_scheduler_type as default parameter constant_with_warmup
KevKibe Sep 24, 2024
abcacc9
update(docs): video demo version
KevKibe Sep 24, 2024
b6b5af6
Merge branch 'main' into optimize-data-prep
KevKibe Sep 24, 2024
a6ade6b
configure setup.py
KevKibe Sep 24, 2024
0da2cb4
update(deployment): add num_workers, language to SpeechTranscriptionP…
KevKibe Sep 24, 2024
17c851f
fix: torch, torchvision package versions
KevKibe Sep 24, 2024
30db1df
fix: compute type in load_asr_model
KevKibe Sep 24, 2024
4df9657
fix: compute type in load_asr_model
KevKibe Sep 24, 2024
e8c7187
fix: compute type in load_asr_model
KevKibe Sep 24, 2024
8b7c572
fix: compute type in load_asr_model
KevKibe Sep 24, 2024
9503e95
comment out install_requires
KevKibe Sep 24, 2024
83ca782
update: model config parameters
KevKibe Sep 24, 2024
a0cbe27
fix: ruff formatting
KevKibe Sep 24, 2024
b2d30bb
fix: get_decoder_prompt_ids method
KevKibe Sep 24, 2024
cd8d095
fix: language parameter
KevKibe Sep 24, 2024
0fce305
fix: model.to("cuda") config
KevKibe Sep 24, 2024
4dc94e7
fix: Trainer missing parameter
KevKibe Sep 24, 2024
a5db923
uncomment out install_requires
KevKibe Sep 24, 2024
c50cc00
update
KevKibe Sep 24, 2024
79cef66
update
KevKibe Sep 24, 2024
a5dde5a
comment out install_requires
KevKibe Sep 24, 2024
22f6d05
update trainer
KevKibe Sep 24, 2024
5 changes: 2 additions & 3 deletions .github/workflows/deployment.speech_inference_tests.yaml
@@ -42,7 +42,6 @@ jobs:

- name: Run tests
env:
HF_READ_TOKEN: ${{ secrets.HF_READ_TOKEN }}
HF_WRITE_TOKEN: ${{ secrets.HF_WRITE_TOKEN }}
HF_TOKEN: ${{ secrets.HF_TOKEN }}
WANDB_TOKEN: ${{ secrets.WANDB_TOKEN }}
run: pytest src/tests/test_model_optimization.py src/tests/test_transcription_pipeline.py
run: pytest -vv src/tests/test_model_optimization.py src/tests/test_transcription_pipeline.py
7 changes: 3 additions & 4 deletions .github/workflows/training.model_prep_test.yaml
@@ -1,4 +1,4 @@
name: Test training.model_trainer Module.
name: Test training.model_prep Module.

on: [pull_request]

@@ -42,7 +42,6 @@ jobs:

- name: Run tests
env:
HF_READ_TOKEN: ${{ secrets.HF_READ_TOKEN }}
HF_WRITE_TOKEN: ${{ secrets.HF_WRITE_TOKEN }}
HF_TOKEN: ${{ secrets.HF_TOKEN }}
WANDB_TOKEN: ${{ secrets.WANDB_TOKEN }}
run: pytest src/tests/test_model_prep.py
run: pytest -vv src/tests/test_model_prep.py
5 changes: 2 additions & 3 deletions .github/workflows/training.model_trainer_tests.yaml
@@ -42,7 +42,6 @@ jobs:

- name: Run tests
env:
HF_READ_TOKEN: ${{ secrets.HF_READ_TOKEN }}
HF_WRITE_TOKEN: ${{ secrets.HF_WRITE_TOKEN }}
HF_TOKEN: ${{ secrets.HF_TOKEN }}
WANDB_TOKEN: ${{ secrets.WANDB_TOKEN }}
run: pytest src/tests/test_model_trainer.py
run: pytest -vv src/tests/test_model_trainer.py
7 changes: 3 additions & 4 deletions .github/workflows/training_tests.yaml
@@ -1,4 +1,4 @@
name: Test Data and Model Prep Modules
name: Test Data Loading and Processing Modules

on: [pull_request]

@@ -42,7 +42,6 @@ jobs:

- name: Run tests
env:
HF_READ_TOKEN: ${{ secrets.HF_READ_TOKEN }}
HF_WRITE_TOKEN: ${{ secrets.HF_WRITE_TOKEN }}
HF_TOKEN: ${{ secrets.HF_TOKEN }}
WANDB_TOKEN: ${{ secrets.WANDB_TOKEN }}
run: pytest src/tests/test_audio_processor.py src/tests/test_data_prep.py src/tests/test_load_dataset.py
run: pytest -vv src/tests/test_audio_processor.py src/tests/test_data_prep.py src/tests/test_load_dataset.py
4 changes: 3 additions & 1 deletion DOCS/gettingstarted.md
@@ -4,6 +4,7 @@
## Usage Demo on Colab(v0.9.12)
- Refer to documentation below for updated instructions and guides.
<iframe width="560" height="315" src="https://www.youtube.com/embed/NHSV8ZyhMVA?si=6217bgwGGUavm-Nq" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" referrerpolicy="strict-origin-when-cross-origin" allowfullscreen></iframe>

## Prerequisites

- Sign up to HuggingFace and get your token keys use this [guide](https://huggingface.co/docs/hub/en/security-tokens).
@@ -66,6 +67,7 @@ processed_dataset = process.load_dataset(
feature_extractor=feature_extractor,
tokenizer=tokenizer,
processor=feature_processor,
streaming=True,
train_num_samples = None, # Optional: int - Number of samples to load into training dataset, default the whole training set.
test_num_samples = None ) # Optional: int - Number of samples to load into test dataset, default the whole test set.
# Set None to load the entire dataset
@@ -112,7 +114,7 @@ trainer.train(
from training.merge_lora import Merger

# Merge PEFT fine-tuned model weights with the base model weights
Merger.merge_lora_weights(hf_model_id="your-finetuned-model-name-on-huggingface-hub", huggingface_write_token = " ")
Merger.merge_lora_weights(hf_model_id="your-finetuned-model-name-on-huggingface-hub", huggingface_token = " ")
```

## Step 7: Test Model using an Audio File
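The getting-started hunks above only show the new `streaming` argument and the `huggingface_token` rename in place. Below is a minimal usage sketch of both loading modes, pieced together from these docs and the updated tests later in this PR; the `Dataset` import path and the sample values are assumptions, since the test file's import lines are collapsed in this diff.

```python
import os

# Import path is an assumption; the tests in this PR construct `Dataset` directly,
# but their import statements are not visible in the diff.
from training.load_data import Dataset

data_loader = Dataset(
    huggingface_token=os.environ.get("HF_TOKEN"),  # single HF_TOKEN replaces HF_READ_TOKEN/HF_WRITE_TOKEN
    dataset_name="mozilla-foundation/common_voice_16_1",
    language_abbr=["af"],
)

# streaming=True iterates the dataset lazily instead of downloading it up front;
# streaming=False keeps the previous download-then-load behaviour.
streamed_dataset = data_loader.load_dataset(streaming=True, train_num_samples=10, test_num_samples=10)
batch_dataset = data_loader.load_dataset(streaming=False, train_num_samples=10, test_num_samples=10)
```

Both calls return objects with `"train"` and `"test"` splits, as exercised in `test_audio_processor.py` further down.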
23 changes: 19 additions & 4 deletions DOCS/troubleshoot.md
@@ -1,10 +1,18 @@
## Troubleshooting Tips

- If you encounter trouble installing `africanwhisper` package on Kaggle, see: <br>
[Issue #142](https://github.com/KevKibe/African-Whisper/issues/142)
- If you encounter trouble installing `africanwhisper` package on Kaggle, and encounter the error:
```commandline
ERROR: Could not install packages due to an OSError: [Errno 2] No such file or directory: '/opt/conda/lib/python3.10/site-packages/aiohttp-3.9.1.dist-info/METADATA'
```
Execute this command before installing the package:
```commandline
!rm /opt/conda/lib/python3.10/site-packages/aiohttp-3.9.1.dist-info -rdf
```
see [Issue #142](https://github.com/KevKibe/African-Whisper/issues/142) for more info.


- If you encounter this error installing `africanwhisper` package on Colab:
```
```commandline
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
spacy 3.7.4 requires typer<0.10.0,>=0.3.0, but you have typer 0.12.3 which is incompatible.
torchtext 0.18.0 requires torch>=2.3.0, but you have torch 2.2.2 which is incompatible.
@@ -15,4 +23,11 @@ WARNING: The following packages were previously imported in this runtime:
[pydevd_plugins]
You must restart the runtime in order to use newly installed versions.
```
- Restart the kernel and continue with the next step.
restart the kernel and continue with the next step.

- If you encounter the error:
```commandline
TypeError: expected string or bytes-like object
```
upgrade `pandas` version to `2.2.2` and restart kernel

5 changes: 3 additions & 2 deletions requirements.txt
@@ -20,8 +20,9 @@ faster-whisper==1.0.3
python-dotenv==1.0.1
pyannote.audio==3.2.0
nltk==3.8.1
torchvision==0.17.2
torchvision
ctranslate2==4.3.1
pandas==2.0.3
pandas==2.2.2
fastapi==0.111.0
uvicorn==0.30.1
tqdm
15 changes: 9 additions & 6 deletions setup.py
@@ -25,13 +25,16 @@
"python-dotenv==1.0.1",
"pyannote-audio==3.2.0",
"nltk==3.8.1",
"torchvision==0.17.2",
"torchvision",
"ctranslate2==4.3.1",
"pandas==2.0.3",
"pandas==2.2.2",
"huggingface_hub",
"soundfile",
"tqdm"
]

DEPLOYMENT_DEPS = [
"torch==2.3.1",
"torch",
"transformers==4.42.3",
"pydantic==2.7.3",
"prometheus-client==0.20.0",
@@ -41,15 +44,15 @@
"faster-whisper==1.0.3",
"pyannote-audio==3.2.0",
"nltk==3.8.1",
"torchvision==0.17.2",
"torchvision",
"ctranslate2==4.3.1",
"pandas==2.2.1",
"pandas==2.2.2",
]
ALL_DEPS = BASE_DEPS + DEPLOYMENT_DEPS

setup(
name="africanwhisper",
version="0.9.12",
version="0.9.13",
author="Kevin Kibe",
author_email="[email protected]",
package_dir={"": "src"},
6 changes: 3 additions & 3 deletions src/deployment/faster_whisper/load_asr_model.py
@@ -46,7 +46,7 @@ def load_asr_model(whisper_arch,
print("No language specified, language will be first be detected for each audio file (increases inference time).")
tokenizer = None

default_asr_options = {
default_asr_options = { # explore temperature_increment_on_fallback parameter
"beam_size": 5,
"best_of": 5,
"patience": 1,
@@ -57,15 +57,15 @@
"compression_ratio_threshold": 2.4,
"log_prob_threshold": -1.0,
"no_speech_threshold": 0.6,
"condition_on_previous_text": False,
"condition_on_previous_text": False, # explore True
"prompt_reset_on_temperature": 0.5,
"initial_prompt": None,
"prefix": None,
"suppress_blank": True,
"suppress_tokens": [-1],
"without_timestamps": True,
"max_initial_timestamp": 0.0,
"word_timestamps": False,
"word_timestamps": False, # Explore True
"prepend_punctuations": "\"'“¿([{-",
"append_punctuations": "\"'.。,,!!??::”)]}、",
"suppress_numerals": False,
4 changes: 2 additions & 2 deletions src/deployment/requirements.txt
@@ -8,7 +8,7 @@ python-dotenv==1.0.1
faster-whisper==1.0.3
pyannote.audio==3.1.1
nltk==3.8.1
torchvision==0.17.2
torchvision
ctranslate2==4.1.0
pandas==2.2.1
pandas==2.2.2
python-multipart==0.0.9
18 changes: 10 additions & 8 deletions src/deployment/speech_inference.py
@@ -40,15 +40,15 @@ def convert_model_to_optimized_format(self) -> None:
else:
print(f"Model {self.model_name} is already in CTranslate2 format")

def load_transcription_model(self) -> object:
def load_transcription_model(self, beam_size: int = 5, language = None) -> object:
"""
Loads the ASR model for transcription.

Returns:
object: Loaded ASR model.
"""
asr_options = {
"beam_size": 5,
"beam_size": beam_size,
"patience": 1.0,
"length_penalty": 1.0,
"temperatures": 0,
@@ -61,14 +61,15 @@
"suppress_numerals": True,
}
model_dir = None
compute_type = "bfloat16" if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else "float32"
# compute_type = "bfloat16" if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else "float16"
compute_type = "float16" if torch.cuda.is_available() else "float32"
model = load_asr_model(
whisper_arch = self.model_name,
device=self.device,
device_index=0,
device_index=0, #for multi-gpu processing
download_root=model_dir,
compute_type=compute_type,
language=None,
language=language,
asr_options=asr_options,
vad_options={"vad_onset": 0.500, "vad_offset": 0.363},
threads=8
@@ -88,7 +89,6 @@ class SpeechTranscriptionPipeline:
batch_size (int): Number of audio segments to process per batch.
chunk_size (int): Duration of each audio chunk for processing.
huggingface_token (str): Read token for accessing Huggingface API.
model_name (str): Name of the model to be used for transcription.
"""
def __init__(self,
audio_file_path: str,
@@ -101,7 +101,7 @@ def __init__(self,
self.device = 0 if torch.cuda.is_available() else "cpu"
self.batch_size = batch_size
self.chunk_size = chunk_size
self.huggingface_token = huggingface_token
self.huggingface_token = huggingface_token,


def transcribe_audio(self, model) -> Dict:
@@ -155,21 +155,23 @@ def align_transcription(self, transcription_result: Dict, alignment_model: str =

def diarize_audio(self,
alignment_result: Dict,
num_speakers: int = 1,
min_speakers: int = 1,
max_speakers: int = 3) -> Dict:
"""
Diarizes the audio and assigns speakers to each segment.

Args:
alignment_result (Dict): Alignment result to be diarized.
num_speakers (int, optional): Number of speakers. Defaults to 1.
min_speakers (int, optional): Minimum number of speakers. Defaults to 1.
max_speakers (int, optional): Maximum number of speakers. Defaults to 3.

Returns:
Dict: Diarization result with speakers assigned to segments.
"""
diarize_model = DiarizationPipeline(token=self.huggingface_token, device=self.device)
diarize_segments = diarize_model(self.audio, min_speakers=min_speakers, max_speakers=max_speakers)
diarize_segments = diarize_model(self.audio, num_speakers = num_speakers, min_speakers=min_speakers, max_speakers=max_speakers)
diarization_result = assign_word_speakers(diarize_segments, alignment_result)
return diarization_result

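For context, here is a hedged sketch of how the new `beam_size`, `language`, and `num_speakers` parameters surface at call time. The `ModelOptimization` class name, the import paths, and the exact `SpeechTranscriptionPipeline` constructor arguments are assumptions, since only parts of `speech_inference.py` are visible in this diff.

```python
import os

# Names marked "assumed" are not visible in this diff and are illustrative only.
from deployment.speech_inference import ModelOptimization, SpeechTranscriptionPipeline  # paths assumed

optimizer = ModelOptimization(model_name="your-finetuned-model-name-on-huggingface-hub")  # class name assumed
optimizer.convert_model_to_optimized_format()

# beam_size and language are the new load_transcription_model parameters in this PR.
model = optimizer.load_transcription_model(beam_size=5, language="af")

pipeline = SpeechTranscriptionPipeline(
    audio_file_path="sample.wav",  # illustrative values
    batch_size=16,
    chunk_size=30,
    huggingface_token=os.environ.get("HF_TOKEN"),
)

transcription = pipeline.transcribe_audio(model)
alignment = pipeline.align_transcription(transcription)
# num_speakers is the new diarization parameter, alongside min_speakers/max_speakers.
diarization = pipeline.diarize_audio(alignment, num_speakers=2, min_speakers=1, max_speakers=3)
```

Whether `num_speakers` overrides or complements `min_speakers`/`max_speakers` depends on the underlying pyannote diarization pipeline, which this diff does not show.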
47 changes: 39 additions & 8 deletions src/tests/test_audio_processor.py
@@ -17,20 +17,29 @@ def setUp(self):
"""
# Load dataset
self.data_loader = Dataset(
huggingface_token = os.environ.get("HF_WRITE_TOKEN"),
huggingface_token = os.environ.get("HF_TOKEN"),
dataset_name="mozilla-foundation/common_voice_16_1",
language_abbr=["yi", "ti"]
language_abbr=["af"]
)
self.dataset = self.data_loader.load_dataset(train_num_samples=10, test_num_samples=10)
has_train_sample = any(True for _ in self.dataset["train"])
self.dataset_streaming = self.data_loader.load_dataset(streaming=True, train_num_samples=10, test_num_samples=10)
self.dataset_batch = self.data_loader.load_dataset(streaming=False, train_num_samples=10, test_num_samples=10)

has_train_sample = any(True for _ in self.dataset_streaming["train"])
assert has_train_sample, "Train dataset is empty!"

has_test_sample = any(True for _ in self.dataset_streaming["test"])
assert has_test_sample, "Test dataset is empty!"

has_train_sample = any(True for _ in self.dataset_batch["train"])
assert has_train_sample, "Train dataset is empty!"

has_test_sample = any(True for _ in self.dataset["test"])
has_test_sample = any(True for _ in self.dataset_batch["test"])
assert has_test_sample, "Test dataset is empty!"

# Initialize model preparation
self.model_prep = WhisperModelPrep(
model_id="openai/whisper-small",
language = ["af"],
model_id="openai/whisper-tiny",
processing_task="transcribe",
use_peft=False
)
@@ -42,7 +51,14 @@

# Initialize AudioDataProcessor
self.processor = AudioDataProcessor(
dataset=self.dataset,
dataset=self.dataset_streaming,
feature_extractor=self.feature_extractor,
tokenizer=self.tokenizer,
feature_processor=self.feature_processor
)

self.processor_batch = AudioDataProcessor(
dataset=self.dataset_batch,
feature_extractor=self.feature_extractor,
tokenizer=self.tokenizer,
feature_processor=self.feature_processor
@@ -53,7 +69,7 @@ def test_resampled_dataset(self):
Test the resampled_dataset method.
"""
# Arrange
sample_dataset = self.dataset
sample_dataset = self.dataset_streaming

# Act & Assert
for split, samples in sample_dataset.items():
@@ -63,5 +79,20 @@
self.assertIn("labels", resampled_data)
self.assertEqual(resampled_data["audio"]["sampling_rate"], 16000)

def test_resampled_dataset_batch(self):
"""
Test the resampled_dataset method.
"""
# Arrange
sample_dataset = self.dataset_batch

# Act & Assert
for split, samples in sample_dataset.items():
for sample in samples:
resampled_data = self.processor_batch.resampled_dataset(sample)
self.assertIn("input_features", resampled_data)
self.assertIn("labels", resampled_data)
self.assertEqual(resampled_data["audio"]["sampling_rate"], 16000)

if __name__ == '__main__':
unittest.main()