Skip to content

Commit

Permalink
feat: add common voice dataset
Browse files Browse the repository at this point in the history
  • Loading branch information
flavioschneider committed Aug 17, 2022
1 parent 8701581 commit 85ed396
Show file tree
Hide file tree
Showing 5 changed files with 87 additions and 16 deletions.
58 changes: 45 additions & 13 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ WAVDataset(
```

### LJSpeech Dataset
An unsupervised dataset for LJSpeech with voice only data.
An unsupervised dataset for LJSpeech with voice-only data.
```py
from audio_data_pytorch import LJSpeechDataset

Expand All @@ -44,19 +44,30 @@ dataset[0] # (1, 158621)
dataset[1] # (1, 153757)
```

### Common Voice Dataset
Multilingual wrapper for the [Common Voice](https://commonvoice.mozilla.org/) dataset with voice-only data. Requires `pip install datasets`. Note that each language requires several GB of storage.
```py
from audio_data_pytorch import CommonVoiceDataset

dataset = CommonVoiceDataset(root='./data')
```

#### Full API:
```py
LJSpeechDataset(
CommonVoiceDataset(
root: str = "./data", # The root where the dataset will be downloaded
with_sample_rate: bool = False, # Returns sample rate as second argument
languages: Sequence[str] = ['en'], # List of languages to include in the dataset
with_sample_rate: bool = False, # Returns sample rate as second argument
transforms: Optional[Callable] = None, # Transforms to apply to audio files
)
```

### Youtube Dataset
A wrapper around yt-dlp that automatically downloads the audio source of Youtube videos.
A wrapper around yt-dlp that automatically downloads the audio source of Youtube videos. Requires `pip install yt-dlp`.

```py
from audio_data_pytorch import YoutubeDataset

dataset = YoutubeDataset(
root='./data',
urls=[
Expand All @@ -82,18 +93,39 @@ dataset = YoutubeDataset(

## Transforms

An example
You can use the following individual transforms, or merge them with `nn.Sequential()`:

```py
from audio_data_pytorch import Resample, OverlapChannels, Crop, RandomCrop, Scale
from audio_data_pytorch import Crop
crop = Crop(size=22050*2, start=0) # Crop 2 seconds at 22050 Hz from the start of the file

crop = Crop(22050) # Crop start of audio track
from audio_data_pytorch import RandomCrop
crop = RandomCrop(size=22050*2) # Crop 2 seconds at 22050 Hz from a random position

transforms = nn.Sequential(
Resample(source=48000, target=22050), # Resample from 48kHz to 22kHz
OverlapChannels(), # Overap channels by sum (C, N) -> (1, N)
RandomCrop(22050 * 3), # Random crop from file
Scale(0.8) # Scale waveform
)
from audio_data_pytorch import Resample
resample = Resample(source=48000, target=22050) # Resamples from 48kHz to 22kHz

from audio_data_pytorch import OverlapChannels
overlap = OverlapChannels() # Overlap channels by sum (C, N) -> (1, N)

from audio_data_pytorch import Scale
scale = Scale(scale=0.8) # Scale waveform amplitude by 0.8

from audio_data_pytorch import Loudness
loudness = Loudness(sampling_rate=22050, target=-20) # Normalize loudness to -20 dB, requires `pip install pyloudnorm`
```

Or use this wrapper to apply a subset of them in one go. Full API:
```py
from audio_data_pytorch import AllTransform

transform = AllTransform(
source_rate: Optional[int] = None,
target_rate: Optional[int] = None,
crop_size: Optional[int] = None,
random_crop_size: Optional[int] = None,
loudness: Optional[int] = None,
scale: Optional[float] = None,
overlap_channels: bool = False,
)
```
1 change: 1 addition & 0 deletions audio_data_pytorch/datasets/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from .common_voice_dataset import CommonVoiceDataset
from .ljspeech_dataset import LJSpeechDataset
from .wav_dataset import WAVDataset
from .youtube_dataset import YoutubeDataset
38 changes: 38 additions & 0 deletions audio_data_pytorch/datasets/common_voice_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
from typing import Callable, Optional, Sequence, Tuple, Union

import torch
from datasets import interleave_datasets, load_dataset
from torch import Tensor
from torch.utils.data import Dataset


class CommonVoiceDataset(Dataset):
    """Voice-only wrapper around the Common Voice dataset.

    For every requested language the ``train`` split of ``common_voice`` is
    loaded with ``datasets.load_dataset`` (using ``root`` as ``cache_dir``),
    and the per-language datasets are combined with
    ``datasets.interleave_datasets``.

    Args:
        root: Directory passed to ``load_dataset`` as ``cache_dir``.
        languages: Language configuration names to load, one dataset each.
        with_sample_rate: If True, ``__getitem__`` returns
            ``(waveform, sample_rate)`` instead of only ``waveform``.
        transforms: Optional callable applied to the waveform tensor.
    """

    def __init__(
        self,
        root: str = "./data",
        # Immutable default: a list default would be shared across all calls.
        languages: Sequence[str] = ("en",),
        with_sample_rate: bool = False,
        transforms: Optional[Callable] = None,
    ):
        self.root = root
        self.with_sample_rate = with_sample_rate
        self.transforms = transforms

        self.dataset = interleave_datasets(
            [
                load_dataset("common_voice", language, split="train", cache_dir=root)
                for language in languages
            ]
        )

    def __getitem__(self, idx: int) -> Union[Tensor, Tuple[Tensor, int]]:
        """Return the waveform at ``idx`` as a tensor of shape ``(1, N)``.

        When ``with_sample_rate`` is set, the item's ``sampling_rate`` value is
        returned alongside the waveform as a ``(waveform, sample_rate)`` tuple.
        """
        data = self.dataset[idx]
        audio = data["audio"]
        # Add a leading channel dimension: (N,) -> (1, N).
        waveform = torch.tensor(audio["array"]).view(1, -1)
        sample_rate = audio["sampling_rate"]

        # Compare against None instead of relying on truthiness: a callable
        # that defines ``__len__`` (e.g. a container of transforms) may be
        # falsy and would otherwise be skipped silently.
        if self.transforms is not None:
            waveform = self.transforms(waveform)
        return (waveform, sample_rate) if self.with_sample_rate else waveform

    def __len__(self) -> int:
        """Return the number of items in the interleaved dataset."""
        return len(self.dataset)
4 changes: 2 additions & 2 deletions audio_data_pytorch/datasets/wav_dataset.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import glob
import os
from typing import Callable, List, Optional, Sequence, Union
from typing import Callable, List, Optional, Sequence, Tuple, Union

import torchaudio
from torch import Tensor
Expand Down Expand Up @@ -28,7 +28,7 @@ def __init__(
self.transforms = transforms
self.with_sample_rate = with_sample_rate

def __getitem__(self, idx: int) -> Tensor:
def __getitem__(self, idx: int) -> Union[Tensor, Tuple[Tensor, Tensor]]:
waveform, sample_rate = torchaudio.load(self.wavs[idx])
if self.transforms:
waveform = self.transforms(waveform)
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
setup(
name="audio-data-pytorch",
packages=find_packages(exclude=[]),
version="0.0.2",
version="0.0.3",
license="MIT",
description="Audio Data - PyTorch",
long_description_content_type="text/markdown",
Expand Down

0 comments on commit 85ed396

Please sign in to comment.