feat: add common voice dataset

archinetai · Aug 17, 2022 · 85ed396 · 85ed396
1 parent 8701581
commit 85ed396
Show file tree

Hide file tree

Showing 5 changed files with 87 additions and 16 deletions.
diff --git a/README.md b/README.md
@@ -34,7 +34,7 @@ WAVDataset(
 ```
 
 ### LJSpeech Dataset
-An unsupervised dataset for LJSpeech with voice only data.
+An unsupervised dataset for LJSpeech with voice-only data.
 ```py
 from audio_data_pytorch import LJSpeechDataset
 
@@ -44,19 +44,30 @@ dataset[0] # (1, 158621)
 dataset[1] # (1, 153757)
 ```
 
+### Common Voice Dataset
+Multilanguage wrapper for the [Common Voice](https://commonvoice.mozilla.org/) dataset with voice-only data. Requires `pip install datasets`. Note that each language requires several GBs of storage.
+```py
+from audio_data_pytorch import CommonVoiceDataset
+
+dataset = CommonVoiceDataset(root='./data')
+```
+
 #### Full API:
 ```py
-LJSpeechDataset(
+CommonVoiceDataset(
     root: str = "./data", # The root where the dataset will be downloaded
-    with_sample_rate: bool = False, # Returns sample rate as second argument
+    languages: Sequence[str] = ['en'], # List of languages to include in the dataset
+    with_sample_rate: bool = False,  # Returns sample rate as second argument
     transforms: Optional[Callable] = None, # Transforms to apply to audio files
 )
 ```
 
 ### Youtube Dataset
-A wrapper around yt-dlp that automatically downloads the audio source of Youtube videos.
+A wrapper around yt-dlp that automatically downloads the audio source of Youtube videos. Requires `pip install yt-dlp`.
 
 ```py
+from audio_data_pytorch import YoutubeDataset
+
 dataset = YoutubeDataset(
     root='./data',
     urls=[
@@ -82,18 +93,39 @@ dataset = YoutubeDataset(
 
 ## Transforms
 
-An example
+You can use the following individual transforms, or merge them with `nn.Sequential()`:
 
 ```py
-from audio_data_pytorch import Resample, OverlapChannels, Crop, RandomCrop, Scale
+from audio_data_pytorch import Crop
+crop = Crop(size=22050*2, start=0) # Crop 2 seconds at 22050 Hz from the start of the file
 
-crop = Crop(22050) # Crop start of audio track
+from audio_data_pytorch import RandomCrop
+crop = RandomCrop(size=22050*2) # Crop 2 seconds at 22050 Hz from a random position
 
-transforms = nn.Sequential(
-    Resample(source=48000, target=22050), # Resample from 48kHz to 22kHz
-    OverlapChannels(), # Overap channels by sum (C, N) -> (1, N)
-    RandomCrop(22050 * 3), # Random crop from file
-    Scale(0.8) # Scale waveform
-)
+from audio_data_pytorch import Resample
+resample = Resample(source=48000, target=22050) # Resamples from 48kHz to 22kHz
 
+from audio_data_pytorch import OverlapChannels
+overlap = OverlapChannels() # Overlap channels by sum (C, N) -> (1, N)
+
+from audio_data_pytorch import Scale
+scale = Scale(scale=0.8) # Scale waveform amplitude by 0.8
+
+from audio_data_pytorch import Loudness
+loudness = Loudness(sampling_rate=22050, target=-20) # Normalize loudness to -20dB, requires `pip install pyloudnorm`
+```
+
+Or use this wrapper to apply a subset of them in one go. Full API:
+```py
+from audio_data_pytorch import AllTransform
+
+transform = AllTransform(
+    source_rate: Optional[int] = None,
+    target_rate: Optional[int] = None,
+    crop_size: Optional[int] = None,
+    random_crop_size: Optional[int] = None,
+    loudness: Optional[int] = None,
+    scale: Optional[float] = None,
+    overlap_channels: bool = False,
+)
 ```
diff --git a/audio_data_pytorch/datasets/__init__.py b/audio_data_pytorch/datasets/__init__.py
@@ -1,3 +1,4 @@
+from .common_voice_dataset import CommonVoiceDataset
 from .ljspeech_dataset import LJSpeechDataset
 from .wav_dataset import WAVDataset
 from .youtube_dataset import YoutubeDataset
diff --git a/audio_data_pytorch/datasets/common_voice_dataset.py b/audio_data_pytorch/datasets/common_voice_dataset.py
@@ -0,0 +1,38 @@
+from typing import Callable, Optional, Sequence, Tuple, Union
+
+import torch
+from datasets import interleave_datasets, load_dataset
+from torch import Tensor
+from torch.utils.data import Dataset
+
+
+# Map-style torch Dataset over the "common_voice" dataset loaded through the
+# `datasets` library. Each item is the audio waveform of one record, optionally
+# paired with its sampling rate.
+class CommonVoiceDataset(Dataset):
+    # Parameters:
+    #   root: directory passed to `load_dataset` as `cache_dir`.
+    #   languages: one "common_voice" configuration name per language to load.
+    #   with_sample_rate: when True, `__getitem__` returns a
+    #       (waveform, sample_rate) pair instead of the waveform alone.
+    #   transforms: optional callable applied to the waveform tensor.
+    # NOTE(review): the `languages` default is a list literal, i.e. one object
+    # shared by every call; it is only iterated here, never mutated.
+    def __init__(
+        self,
+        root: str = "./data",
+        languages: Sequence[str] = ["en"],
+        with_sample_rate: bool = False,
+        transforms: Optional[Callable] = None,
+    ):
+        self.root = root
+        self.with_sample_rate = with_sample_rate
+        self.transforms = transforms
+
+        # Load the "train" split of every requested language and combine them
+        # into a single dataset with `interleave_datasets`.
+        self.dataset = interleave_datasets(
+            [
+                load_dataset("common_voice", language, split="train", cache_dir=root)
+                for language in languages
+            ]
+        )
+
+    # Return the waveform of record `idx`, or (waveform, sample_rate) when
+    # `with_sample_rate` is set.
+    # NOTE(review): the annotation says Tuple[Tensor, Tensor], but `sample_rate`
+    # is returned exactly as stored under data["audio"]["sampling_rate"] and is
+    # not wrapped in a tensor here - confirm the intended type.
+    def __getitem__(self, idx: int) -> Union[Tensor, Tuple[Tensor, Tensor]]:
+        data = self.dataset[idx]
+        # Reshape the decoded audio array to a single row: (1, num_samples).
+        waveform = torch.tensor(data["audio"]["array"]).view(1, -1)
+        sample_rate = data["audio"]["sampling_rate"]
+
+        # Transforms receive only the waveform; the sample rate is untouched.
+        if self.transforms:
+            waveform = self.transforms(waveform)
+        return (waveform, sample_rate) if self.with_sample_rate else waveform
+
+    # Number of records in the combined (interleaved) dataset.
+    def __len__(self) -> int:
+        return len(self.dataset)
diff --git a/audio_data_pytorch/datasets/wav_dataset.py b/audio_data_pytorch/datasets/wav_dataset.py
@@ -1,6 +1,6 @@
 import glob
 import os
-from typing import Callable, List, Optional, Sequence, Union
+from typing import Callable, List, Optional, Sequence, Tuple, Union
 
 import torchaudio
 from torch import Tensor
@@ -28,7 +28,7 @@ def __init__(
         self.transforms = transforms
         self.with_sample_rate = with_sample_rate
 
-    def __getitem__(self, idx: int) -> Tensor:
+    def __getitem__(self, idx: int) -> Union[Tensor, Tuple[Tensor, Tensor]]:
         waveform, sample_rate = torchaudio.load(self.wavs[idx])
         if self.transforms:
             waveform = self.transforms(waveform)

diff --git a/setup.py b/setup.py
@@ -3,7 +3,7 @@
 setup(
     name="audio-data-pytorch",
     packages=find_packages(exclude=[]),
-    version="0.0.2",
+    version="0.0.3",
     license="MIT",
     description="Audio Data - PyTorch",
     long_description_content_type="text/markdown",