feat: added ljspeech, youtube, and transforms

archinetai · Jul 25, 2022 · 39993db · 39993db
commit 39993db
Showing 12 changed files with 559 additions and 0 deletions.
diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
@@ -0,0 +1,39 @@
+# This workflow will upload a Python Package using Twine when a release is created
+# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
+
+# This workflow uses actions that are not certified by GitHub.
+# They are provided by a third-party and are governed by
+# separate terms of service, privacy policy, and support
+# documentation.
+
+name: Upload Python Package
+
+on:
+  release:
+    types: [published]  # trigger: a release being published
+
+permissions:
+  contents: read  # read-only access to repository contents
+
+jobs:
+  deploy:
+
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v3
+    - name: Set up Python
+      uses: actions/setup-python@v3
+      with:
+        python-version: '3.x'
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install build
+    - name: Build package
+      run: python -m build
+    - name: Publish package
+      uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29  # pinned to a full commit SHA rather than a tag
+      with:
+        user: __token__  # literal user name that goes with an API-token password
+        password: ${{ secrets.PYPI_API_TOKEN }}  # supplied via the PYPI_API_TOKEN secret
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,2 @@
+__pycache__
+.mypy_cache
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -0,0 +1,40 @@
+repos:
+# Whitespace hygiene: final newline and trailing spaces
+-   repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v2.3.0
+    hooks:
+    -   id: end-of-file-fixer
+    -   id: trailing-whitespace
+
+# Formats code correctly
+-   repo: https://github.com/psf/black
+    rev: 21.12b0
+    hooks:
+    -   id: black
+        args: [
+            '--experimental-string-processing'
+        ]
+
+# Sorts imports
+-   repo: https://github.com/pycqa/isort
+    rev: 5.10.1
+    hooks:
+    -   id: isort
+        name: isort (python)
+
+# Checks unused imports, line lengths, etc.
+# flake8 is hosted on GitHub; the former GitLab URL can no longer be cloned
+-   repo: https://github.com/pycqa/flake8
+    rev: 4.0.0
+    hooks:
+    -   id: flake8
+        args: [
+            '--per-file-ignores=__init__.py:F401',
+            '--max-line-length=88',
+            '--ignore=E203'
+        ]
+
+# Checks types
+-   repo: https://github.com/pre-commit/mirrors-mypy
+    rev: 'v0.971'
+    hooks:
+    -   id: mypy
+        additional_dependencies: [data-science-types>=0.2, torch>=1.6]
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2022 archinet.ai
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
@@ -0,0 +1,98 @@
+
+# Audio Data - PyTorch
+
+A collection of useful audio datasets and transforms for PyTorch.
+
+## Install
+
+```bash
+pip install audio-data-pytorch
+```
+
+[![PyPI - Version](https://img.shields.io/pypi/v/audio-data-pytorch?style=flat&colorA=0f0f0f&colorB=0f0f0f)](https://pypi.org/project/audio-data-pytorch/)
+
+## Datasets
+
+### WAV Dataset
+
+Load one or multiple folders of `.wav` files as dataset.
+
+```py
+from audio_data_pytorch import WAVDataset
+
+dataset = WAVDataset(path=['my/path1', 'my/path2'])
+```
+
+#### Full API:
+```py
+WAVDataset(
+    path: Union[str, Sequence[str]], # Path or list of paths from which to load files
+    recursive: bool = False, # Recursively load files from provided paths
+    with_sample_rate: bool = False, # Returns sample rate as second argument
+    transforms: Optional[Callable] = None, # Transforms to apply to audio files
+)
+```
+
+### LJSpeech Dataset
+An unsupervised dataset for LJSpeech with voice-only data.
+```py
+from audio_data_pytorch import LJSpeechDataset
+
+dataset = LJSpeechDataset(root='./data')
+
+dataset[0] # (1, 158621)
+dataset[1] # (1, 153757)
+```
+
+#### Full API:
+```py
+LJSpeechDataset(
+    root: str = "./data", # The root where the dataset will be downloaded
+    with_sample_rate: bool = False, # Returns sample rate as second argument
+    transforms: Optional[Callable] = None, # Transforms to apply to audio files
+)
+```
+
+### Youtube Dataset
+A wrapper around yt-dlp that automatically downloads the audio source of YouTube videos.
+
+```py
+from audio_data_pytorch import YoutubeDataset
+
+dataset = YoutubeDataset(
+    root='./data',
+    urls=[
+        "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
+        "https://www.youtube.com/watch?v=BZ-_KQezKmU",
+    ],
+    crop_length=10 # Crop source in 10s chunks (optional but suggested)
+)
+dataset[0] # (2, 480000)
+```
+
+#### Full API:
+```py
+dataset = YoutubeDataset(
+    urls: Sequence[str], # The list of youtube urls
+    root: str = "./data", # The root where the dataset will be downloaded
+    crop_length: Optional[int] = None, # Crops the source into chunks of `crop_length` seconds
+    with_sample_rate: bool = False, # Returns sample rate as second argument
+    transforms: Optional[Callable] = None, # Transforms to apply to audio files
+)
+```
+
+
+## Transforms
+
+An example of the available transforms:
+
+```py
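+from torch import nn
+from audio_data_pytorch import Crop, OverlapChannels, RandomCrop, Resample, Scale
+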
+crop = Crop(22050) # Crop start of audio track
+
+transforms = nn.Sequential(
+    Resample(source=48000, target=22050), # Resample from 48kHz to 22kHz
+    OverlapChannels(), # Overlap channels by sum (C, N) -> (1, N)
+    RandomCrop(22050 * 3), # Random crop from file
+    Scale(0.8) # Scale waveform
+)
+
+```
diff --git a/audio_data_pytorch/__init__.py b/audio_data_pytorch/__init__.py
@@ -0,0 +1,4 @@
+from .ljspeech_dataset import LJSpeechDataset
+from .transforms import Crop, OverlapChannels, RandomCrop, Resample, Scale
+from .wav_dataset import WAVDataset
+from .youtube_dataset import YoutubeDataset
diff --git a/audio_data_pytorch/ljspeech_dataset.py b/audio_data_pytorch/ljspeech_dataset.py
@@ -0,0 +1,57 @@
+import os
+import tarfile
+
+import requests  # type: ignore
+from tqdm import tqdm
+
+from .utils import camel_to_snake
+from .wav_dataset import WAVDataset
+
+
+class LJSpeechDataset(WAVDataset):
+
+    data_url = "https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2"
+    data_tar_file = "LJSpeech-1.1.tar.bz2"
+    data_waws_path = "LJSpeech-1.1/wavs"
+
+    def __init__(self, root: str = "./data", **kwargs) -> None:
+        self.root = root
+
+        if not os.path.exists(self.data_path):
+            print(
+                f"Data not found in {self.data_path}, downloading {self.data_tar_file}"
+            )
+            self.download()
+
+        super().__init__(path=self.wavs_path, **kwargs)
+
+    @property
+    def data_path(self) -> str:
+        return os.path.join(self.root, camel_to_snake(self.__class__.__name__))
+
+    @property
+    def file_path(self) -> str:
+        return os.path.join(self.data_path, self.data_tar_file)
+
+    @property
+    def wavs_path(self) -> str:
+        return os.path.join(self.data_path, self.data_waws_path)
+
+    def download(self) -> None:
+        os.makedirs(self.data_path, exist_ok=True)
+        response = requests.get(self.data_url, stream=True)
+        block_size = 1024  # Kibibyte
+        progress_bar = tqdm(total=block_size, unit="iB", unit_scale=True)
+
+        with open(self.file_path, "wb") as file:
+            for data in response.iter_content(block_size):
+                progress_bar.update(len(data))
+                file.write(data)
+        progress_bar.close()
+        self.decompress()
+
+    def decompress(self) -> None:
+        print(f"Decompressing {self.data_tar_file} to {self.data_path}")
+        file = tarfile.open(self.file_path)
+        file.extractall(self.data_path)
+        file.close()
diff --git a/audio_data_pytorch/transforms.py b/audio_data_pytorch/transforms.py
@@ -0,0 +1,82 @@
+import random
+
+import torch
+import torchaudio
+from torch import Tensor, nn
+
+
+class Crop(nn.Module):
+    """Crops waveform to fixed size"""
+
+    def __init__(self, size: int, start: int = 0) -> None:
+        super().__init__()
+        self.size = size  # output length along dim 1
+        self.start = start  # index on dim 1 where the crop begins
+
+    def forward(self, x: Tensor) -> Tensor:
+        x = x[:, self.start :]  # drop everything before `start`
+        channels, length = x.shape  # unpacking two values means x must be 2-D
+
+        if length < self.size:
+            padding_length = self.size - length
+            padding = torch.zeros(channels, padding_length).to(x)  # .to(x) matches x's dtype and device
+            return torch.cat([x, padding], dim=1)  # too short: zero-pad the end up to `size`
+        else:
+            return x[:, 0 : self.size]  # long enough: keep the first `size` entries
+
+
+class RandomCrop(nn.Module):
+    """Crops random chunk from the waveform"""
+
+    def __init__(self, size: int) -> None:
+        super().__init__()
+        self.size = size  # output length along dim 1
+
+    def forward(self, x: Tensor) -> Tensor:
+        # Pick start position
+        length = x.shape[1]
+        start = random.randint(0, max(length - self.size, 0))  # randint includes both ends; start is 0 when x is shorter than `size`
+        # Crop from random start
+        x = x[:, start:]
+        channels, length = x.shape  # unpacking two values means x must be 2-D
+        # Pad to end if not large enough, else crop end
+        if length < self.size:
+            padding_length = self.size - length
+            padding = torch.zeros(channels, padding_length).to(x)  # .to(x) matches x's dtype and device
+            return torch.cat([x, padding], dim=1)
+        else:
+            return x[:, 0 : self.size]
+
+
+class OverlapChannels(nn.Module):
+    """Overlaps all channels into one"""
+
+    def forward(self, x: Tensor) -> Tensor:
+        return torch.sum(x, dim=0, keepdim=True)  # sums over dim 0 and keeps it with size 1: 'c l -> 1 l'
+
+
+class Resample(nn.Module):
+    """Resamples frequency of waveform"""
+
+    def __init__(self, source: int, target: int):
+        super().__init__()
+        self.transform = torchaudio.transforms.Resample(  # built once here and reused by every forward call
+            orig_freq=source, new_freq=target
+        )
+
+    def forward(self, x: Tensor) -> Tensor:
+        return self.transform(x)  # delegates entirely to the torchaudio transform
+
+
+class Scale(nn.Module):
+    """Scales waveform (change volume)"""
+
+    def __init__(
+        self,
+        scale: float,
+    ):
+        super().__init__()
+        self.scale = scale  # constant factor applied in forward
+
+    def forward(self, x: Tensor) -> Tensor:
+        return x * self.scale  # multiplies every element by the factor
diff --git a/audio_data_pytorch/utils.py b/audio_data_pytorch/utils.py
@@ -0,0 +1,15 @@
+import re
+from typing import Optional, TypeVar
+
+from typing_extensions import TypeGuard
+
+T = TypeVar("T")
+
+
+def exists(val: Optional[T]) -> TypeGuard[T]:
+    return val is not None  # identity check: falsy values such as 0 or "" still count as existing
+
+
+def camel_to_snake(name: str) -> str:
+    name = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name)  # underscore before a capital that starts a lowercase run: "LJSpeech" -> "LJ_Speech"
+    return re.sub("([a-z0-9])([A-Z])", r"\1_\2", name).lower()  # underscore between a lowercase letter/digit and a capital, then lowercase everything