KevKibe · Sep 25, 2024
diff --git a/setup.py b/setup.py
@@ -71,7 +71,7 @@
     long_description_content_type="text/markdown",
     license = "MIT",
     python_requires=">=3.9",
-    # install_requires = BASE_DEPS,
+    install_requires = BASE_DEPS,
     extras_require={
         "all": ALL_DEPS,
         "training": BASE_DEPS,

diff --git a/src/tests/test_load_dataset.py b/src/tests/test_load_dataset.py
@@ -30,16 +30,24 @@ def test_load_dataset(self):
 
     def test_count_examples(self):
         """Test counting examples in a dataset."""
         # Arrange
         class MockDataset:
-            """A mock dataset class to simulate iteration."""
-            def __iter__(self):
-                return iter(range(10))
+            """Dictionary-like stand-in for a dataset exposing 'train' and 'test' splits."""
+
+            def __init__(self):
+                # Ten training examples and five testing examples.
+                self.data = {"train": list(range(10)), "test": list(range(5))}
+
+            def __getitem__(self, key):
+                return self.data[key]
+
         # Act
-        count = self.dataset_manager.count_examples(MockDataset())
+        counts = self.dataset_manager.count_examples(MockDataset())
+        train_count, test_count = counts
 
         # Assert
-        self.assertEqual(count, 10, "The count of examples should be equal to 10.")
+        self.assertEqual(train_count, 10, "The count of training examples should be equal to 10.")
+        self.assertEqual(test_count, 5, "The count of testing examples should be equal to 5.")
 
     def test_dataset_structure(self):
         """Test the structure of the loaded dataset."""

diff --git a/src/training/data_prep.py b/src/training/data_prep.py
@@ -139,8 +139,9 @@ def load_dataset(
             train_num_samples = train_num_samples,
             test_num_samples = test_num_samples
         )
-        print(f"Training dataset size: {self.data_loader.count_examples(dataset['train'])}")
-        print(f"Test dataset size: {self.data_loader.count_examples(dataset['test'])}")
+        train_count, test_count = self.data_loader.count_examples(dataset)
+        print(f"Training dataset size: {train_count}")
+        print(f"Test dataset size: {test_count}")
         processor = AudioDataProcessor(dataset, feature_extractor, tokenizer, processor)
         dataset['train']= dataset['train'].map(processor.resampled_dataset, remove_columns=list(next(iter(dataset['train'])).keys()))
         dataset['test']= dataset['test'].map(processor.resampled_dataset)

diff --git a/src/training/load_data.py b/src/training/load_data.py
@@ -1,7 +1,7 @@
-from datasets import load_dataset, IterableDataset, concatenate_datasets
+from datasets import load_dataset, IterableDatasetDict, concatenate_datasets
 import warnings
 from typing import List
-from datasets import IterableDatasetDict, DatasetDict
+from datasets import DatasetDict
 from huggingface_hub import HfFolder
 warnings.filterwarnings("ignore")
 
@@ -82,21 +82,24 @@ def load_dataset(self, streaming: bool = True, train_num_samples: int = None, te
 
         return data
 
-
-    def count_examples(self, dataset: IterableDataset) -> int:
+    @staticmethod
+    def count_examples(dataset: dict) -> tuple:
         """
-        Count the number of examples in the dataset.
+        Count the examples in the "train" and "test" splits of a dataset.
 
         Args:
-            dataset (IterableDataset): The dataset to count examples from.
+            dataset (dict): Mapping-like dataset whose "train" and "test"
+                entries are iterable splits.
 
         Returns:
-            int: The number of examples in the dataset.
+            tuple: ``(train_samples, test_samples)``, the number of examples
+                in the training and test splits respectively.
         """
-        count = 0
-        for _ in dataset:
-            count += 1
-        return count
+        # Count while iterating instead of building a list, so a large or
+        # streamed split is never held in memory just to be measured.
+        train_samples = sum(1 for _ in dataset["train"])
+        test_samples = sum(1 for _ in dataset["test"])
+        return train_samples, test_samples
 
 
 def load_and_validate_ps_datasets(