diff --git a/README.md b/README.md
index b3a70ea..ee075b7 100644
--- a/README.md
+++ b/README.md
@@ -34,7 +34,7 @@ WAVDataset(
 ```
 
 ### LJSpeech Dataset
-An unsupervised dataset for LJSpeech with voice only data.
+An unsupervised dataset for LJSpeech with voice-only data.
 ```py
 from audio_data_pytorch import LJSpeechDataset
 
@@ -44,19 +44,30 @@
 dataset[0] # (1, 158621)
 dataset[1] # (1, 153757)
 ```
 
+### Common Voice Dataset
+Multilingual wrapper for the [Common Voice](https://commonvoice.mozilla.org/) dataset with voice-only data. Requires `pip install datasets`. Note that each language requires several GBs of storage.
+```py
+from audio_data_pytorch import CommonVoiceDataset
+
+dataset = CommonVoiceDataset(root='./data')
+```
+
 #### Full API:
 ```py
-LJSpeechDataset(
+CommonVoiceDataset(
     root: str = "./data", # The root where the dataset will be downloaded
-    with_sample_rate: bool = False, # Returns sample rate as second argument
+    languages: Sequence[str] = ['en'], # List of languages to include in the dataset
+    with_sample_rate: bool = False, # Returns sample rate as second argument
     transforms: Optional[Callable] = None, # Transforms to apply to audio files
 )
 ```
 
 ### Youtube Dataset
-A wrapper around yt-dlp that automatically downloads the audio source of Youtube videos.
+A wrapper around yt-dlp that automatically downloads the audio source of Youtube videos. Requires `pip install yt-dlp`.
 ```py
+from audio_data_pytorch import YoutubeDataset
+
 dataset = YoutubeDataset(
     root='./data',
     urls=[
@@ -82,18 +93,39 @@ dataset = YoutubeDataset(
 ```
 
 ## Transforms
 
-An example
+You can use the following individual transforms, or compose them with `nn.Sequential()`:
 ```py
-from audio_data_pytorch import Resample, OverlapChannels, Crop, RandomCrop, Scale
+from audio_data_pytorch import Crop
+crop = Crop(size=22050*2, start=0) # Crop 2 seconds at 22050 Hz from the start of the file
 
-crop = Crop(22050) # Crop start of audio track
+from audio_data_pytorch import RandomCrop
+crop = RandomCrop(size=22050*2) # Crop 2 seconds at 22050 Hz from a random position
 
-transforms = nn.Sequential(
-    Resample(source=48000, target=22050), # Resample from 48kHz to 22kHz
-    OverlapChannels(), # Overap channels by sum (C, N) -> (1, N)
-    RandomCrop(22050 * 3), # Random crop from file
-    Scale(0.8) # Scale waveform
-)
+from audio_data_pytorch import Resample
+resample = Resample(source=48000, target=22050) # Resample from 48kHz to 22.05kHz
+from audio_data_pytorch import OverlapChannels
+overlap = OverlapChannels() # Overlap channels by sum: (C, N) -> (1, N)
+
+from audio_data_pytorch import Scale
+scale = Scale(scale=0.8) # Scale waveform amplitude by 0.8
+
+from audio_data_pytorch import Loudness
+loudness = Loudness(sampling_rate=22050, target=-20) # Normalize loudness to -20dB, requires `pip install pyloudnorm`
+```
+
+Or use this wrapper to apply a subset of them in one go. Full API:
+```py
+from audio_data_pytorch import AllTransform
+
+transform = AllTransform(
+    source_rate: Optional[int] = None,
+    target_rate: Optional[int] = None,
+    crop_size: Optional[int] = None,
+    random_crop_size: Optional[int] = None,
+    loudness: Optional[int] = None,
+    scale: Optional[float] = None,
+    overlap_channels: bool = False,
+)
 ```
 
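+
+For example, several of the transforms above can be applied through a single `AllTransform` (a sketch; the 48kHz source rate and all other values are illustrative assumptions, not defaults):
+```py
+from audio_data_pytorch import AllTransform
+
+transform = AllTransform(
+    source_rate=48000, # Assumed sample rate of the source files
+    target_rate=22050, # Resample down to 22.05kHz
+    random_crop_size=22050 * 2, # Keep 2 seconds from a random position
+    loudness=-20, # Normalize loudness, requires `pip install pyloudnorm`
+    overlap_channels=True, # Mix down to mono: (C, N) -> (1, N)
+)
+```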
diff --git a/audio_data_pytorch/datasets/__init__.py b/audio_data_pytorch/datasets/__init__.py
index 6e1eaf0..1f6b431 100644
--- a/audio_data_pytorch/datasets/__init__.py
+++ b/audio_data_pytorch/datasets/__init__.py
@@ -1,3 +1,4 @@
+from .common_voice_dataset import CommonVoiceDataset
 from .ljspeech_dataset import LJSpeechDataset
 from .wav_dataset import WAVDataset
 from .youtube_dataset import YoutubeDataset
diff --git a/audio_data_pytorch/datasets/common_voice_dataset.py b/audio_data_pytorch/datasets/common_voice_dataset.py
new file mode 100644
index 0000000..a82226f
--- /dev/null
+++ b/audio_data_pytorch/datasets/common_voice_dataset.py
@@ -0,0 +1,38 @@
+from typing import Callable, Optional, Sequence, Tuple, Union
+
+import torch
+from datasets import interleave_datasets, load_dataset
+from torch import Tensor
+from torch.utils.data import Dataset
+
+
+class CommonVoiceDataset(Dataset):
+    def __init__(
+        self,
+        root: str = "./data",
+        languages: Sequence[str] = ["en"],
+        with_sample_rate: bool = False,
+        transforms: Optional[Callable] = None,
+    ):
+        self.root = root
+        self.with_sample_rate = with_sample_rate
+        self.transforms = transforms
+
+        self.dataset = interleave_datasets(  # Alternate examples across languages
+            [
+                load_dataset("common_voice", language, split="train", cache_dir=root)
+                for language in languages
+            ]
+        )
+
+    def __getitem__(self, idx: int) -> Union[Tensor, Tuple[Tensor, Tensor]]:
+        data = self.dataset[idx]
+        waveform = torch.tensor(data["audio"]["array"]).view(1, -1)  # Shape (1, num_samples)
+        sample_rate = data["audio"]["sampling_rate"]
+
+        if self.transforms:
+            waveform = self.transforms(waveform)
+        return (waveform, sample_rate) if self.with_sample_rate else waveform
+
+    def __len__(self) -> int:
+        return len(self.dataset)
diff --git a/audio_data_pytorch/datasets/wav_dataset.py b/audio_data_pytorch/datasets/wav_dataset.py
index 584e948..b7ee374 100644
--- a/audio_data_pytorch/datasets/wav_dataset.py
+++ b/audio_data_pytorch/datasets/wav_dataset.py
@@ -1,6 +1,6 @@
 import glob
 import os
-from typing import Callable, List, Optional, Sequence, Union
+from typing import Callable, List, Optional, Sequence, Tuple, Union
 
 import torchaudio
 from torch import Tensor
@@ -28,7 +28,7 @@ def __init__(
         self.transforms = transforms
         self.with_sample_rate = with_sample_rate
 
-    def __getitem__(self, idx: int) -> Tensor:
+    def __getitem__(self, idx: int) -> Union[Tensor, Tuple[Tensor, Tensor]]:
         waveform, sample_rate = torchaudio.load(self.wavs[idx])
         if self.transforms:
             waveform = self.transforms(waveform)
diff --git a/setup.py b/setup.py
index 5891490..d763bcc 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@
 setup(
     name="audio-data-pytorch",
     packages=find_packages(exclude=[]),
-    version="0.0.2",
+    version="0.0.3",
     license="MIT",
     description="Audio Data - PyTorch",
     long_description_content_type="text/markdown",
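Taken together, the changes above allow an end-to-end sketch like the following (hypothetical usage: `48000 * 2` assumes 48kHz source clips of at least 2 seconds, and the fixed-size crop is what makes batching variable-length clips possible):

```py
from torch.utils.data import DataLoader

from audio_data_pytorch import CommonVoiceDataset, RandomCrop

# Fixed-size random crops so that variable-length clips can be batched
dataset = CommonVoiceDataset(
    root='./data',
    languages=['en'],
    transforms=RandomCrop(size=48000 * 2), # 2 seconds, assuming 48kHz audio
)

loader = DataLoader(dataset, batch_size=8, shuffle=True)
waveforms = next(iter(loader)) # Tensor of shape (8, 1, 96000)
```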