Add LibriTTS dataset (#790)
* Add libritts

Add LibriTTS dataset draft

* Add libritts

Use two separate ids for utterance_id.

* Update output form

Use full_id as utterance_id.

* Update format

Add space and test black format

* Update test method

* Add audio and text test

Generate audio and text files on-the-fly in the test.

* Update format

* Fix test error and remove assets libritts

The test error is fixed by sorting the samples by their 4th element instead of the 2nd. Since the files are generated on-the-fly, the LibriTTS files in assets are removed.

* Add seed in `get_whitenoise` function

* Change utterance to text

Change `_utterance` to `_text`.

Co-authored-by: Ji Chen <[email protected]>
jimchen90 and Ji Chen authored Jul 20, 2020
1 parent 209858e commit 4b8aad7
Showing 4 changed files with 204 additions and 0 deletions.
8 changes: 8 additions & 0 deletions docs/source/datasets.rst
@@ -57,6 +57,14 @@ LIBRISPEECH
:special-members:


LIBRITTS
~~~~~~~~

.. autoclass:: LIBRITTS
:members: __getitem__
:special-members:


LJSPEECH
~~~~~~~~

63 changes: 63 additions & 0 deletions test/test_datasets.py
@@ -10,6 +10,7 @@
from torchaudio.datasets.ljspeech import LJSPEECH
from torchaudio.datasets.gtzan import GTZAN
from torchaudio.datasets.cmuarctic import CMUARCTIC
from torchaudio.datasets.libritts import LIBRITTS

from .common_utils import (
TempDirMixin,
@@ -110,5 +111,67 @@ def test_yesno(self):
assert label == expected_label


class TestLibriTTS(TempDirMixin, TorchaudioTestCase):
backend = 'default'

root_dir = None
data = []
utterance_ids = [
[19, 198, '000000', '000000'],
[26, 495, '000004', '000000'],
]
original_text = 'this is the original text.'
normalized_text = 'this is the normalized text.'

@classmethod
def setUpClass(cls):
cls.root_dir = cls.get_base_temp_dir()
base_dir = os.path.join(cls.root_dir, 'LibriTTS', 'train-clean-100')
for i, utterance_id in enumerate(cls.utterance_ids):
filename = f'{"_".join(str(u) for u in utterance_id)}.wav'
file_dir = os.path.join(base_dir, str(utterance_id[0]), str(utterance_id[1]))
os.makedirs(file_dir, exist_ok=True)
path = os.path.join(file_dir, filename)

data = get_whitenoise(sample_rate=8000, duration=6, n_channels=1, dtype='int16', seed=i)
save_wav(path, data, 8000)
cls.data.append(normalize_wav(data))

original_text_filename = f'{"_".join(str(u) for u in utterance_id)}.original.txt'
path_original = os.path.join(file_dir, original_text_filename)
            with open(path_original, 'w') as f:
                f.write(cls.original_text)

normalized_text_filename = f'{"_".join(str(u) for u in utterance_id)}.normalized.txt'
path_normalized = os.path.join(file_dir, normalized_text_filename)
            with open(path_normalized, 'w') as f:
                f.write(cls.normalized_text)

def test_libritts(self):
dataset = LIBRITTS(self.root_dir)
samples = list(dataset)
samples.sort(key=lambda s: s[4])

for i, (waveform,
sample_rate,
original_text,
normalized_text,
speaker_id,
chapter_id,
utterance_id) in enumerate(samples):

expected_ids = self.utterance_ids[i]
expected_data = self.data[i]
self.assertEqual(expected_data, waveform, atol=5e-5, rtol=1e-8)
assert sample_rate == 8000
assert speaker_id == expected_ids[0]
assert chapter_id == expected_ids[1]
assert original_text == self.original_text
assert normalized_text == self.normalized_text
            assert utterance_id == "_".join(str(u) for u in expected_ids[-4:])


if __name__ == "__main__":
unittest.main()
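
The seed argument added to get_whitenoise is what makes these fixtures deterministic: each file gets distinct, reproducible content, so the waveform comparison above can match files to expected data after sorting. A small sketch of that property (the helper's import path is assumed from the test's relative import):

    import torch
    from test.common_utils import get_whitenoise  # assumed module path for the test helper

    a = get_whitenoise(sample_rate=8000, duration=6, n_channels=1, dtype='int16', seed=0)
    b = get_whitenoise(sample_rate=8000, duration=6, n_channels=1, dtype='int16', seed=0)
    c = get_whitenoise(sample_rate=8000, duration=6, n_channels=1, dtype='int16', seed=1)
    assert torch.equal(a, b)       # same seed, same data
    assert not torch.equal(a, c)   # different seed, different data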
2 changes: 2 additions & 0 deletions torchaudio/datasets/__init__.py
@@ -7,6 +7,7 @@
from .yesno import YESNO
from .ljspeech import LJSPEECH
from .cmuarctic import CMUARCTIC
from .libritts import LIBRITTS

__all__ = (
"COMMONVOICE",
@@ -17,6 +18,7 @@
"LJSPEECH",
"GTZAN",
"CMUARCTIC",
"LIBRITTS"
"diskcache_iterator",
"bg_iterator",
)
131 changes: 131 additions & 0 deletions torchaudio/datasets/libritts.py
@@ -0,0 +1,131 @@
import os
from typing import Tuple

import torchaudio
from torch import Tensor
from torch.utils.data import Dataset
from torchaudio.datasets.utils import (
download_url,
extract_archive,
walk_files,
)

URL = "train-clean-100"
FOLDER_IN_ARCHIVE = "LibriTTS"
_CHECKSUMS = {
"http://www.openslr.org/60/dev-clean.tar.gz": "0c3076c1e5245bb3f0af7d82087ee207",
"http://www.openslr.org/60/dev-other.tar.gz": "815555d8d75995782ac3ccd7f047213d",
"http://www.openslr.org/60/test-clean.tar.gz": "7bed3bdb047c4c197f1ad3bc412db59f",
"http://www.openslr.org/60/test-other.tar.gz": "ae3258249472a13b5abef2a816f733e4",
"http://www.openslr.org/60/train-clean-100.tar.gz": "4a8c202b78fe1bc0c47916a98f3a2ea8",
"http://www.openslr.org/60/train-clean-360.tar.gz": "a84ef10ddade5fd25df69596a2767b2d",
"http://www.openslr.org/60/train-other-500.tar.gz": "7b181dd5ace343a5f38427999684aa6f",
}


def load_libritts_item(
fileid: str,
path: str,
ext_audio: str,
ext_original_txt: str,
ext_normalized_txt: str,
) -> Tuple[Tensor, int, str, str, int, int, str]:
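    # Filenames follow "{speaker}_{chapter}_{segment}_{utterance}"; the full fileid is reused as utterance_id below.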
speaker_id, chapter_id, segment_id, utterance_id = fileid.split("_")
utterance_id = fileid

normalized_text = utterance_id + ext_normalized_txt
normalized_text = os.path.join(path, speaker_id, chapter_id, normalized_text)

original_text = utterance_id + ext_original_txt
original_text = os.path.join(path, speaker_id, chapter_id, original_text)

file_audio = utterance_id + ext_audio
file_audio = os.path.join(path, speaker_id, chapter_id, file_audio)

# Load audio
waveform, sample_rate = torchaudio.load(file_audio)

# Load original text
with open(original_text) as ft:
original_text = ft.readline()

# Load normalized text
with open(normalized_text, "r") as ft:
normalized_text = ft.readline()

return (
waveform,
sample_rate,
original_text,
normalized_text,
int(speaker_id),
int(chapter_id),
utterance_id,
)


class LIBRITTS(Dataset):
"""
Create a Dataset for LibriTTS. Each item is a tuple of the form:
waveform, sample_rate, original_text, normalized_text, speaker_id, chapter_id, utterance_id
"""

_ext_original_txt = ".original.txt"
_ext_normalized_txt = ".normalized.txt"
_ext_audio = ".wav"

def __init__(
self,
root: str,
url: str = URL,
folder_in_archive: str = FOLDER_IN_ARCHIVE,
download: bool = False,
) -> None:

if url in [
"dev-clean",
"dev-other",
"test-clean",
"test-other",
"train-clean-100",
"train-clean-360",
"train-other-500",
]:

ext_archive = ".tar.gz"
base_url = "http://www.openslr.org/resources/60/"

url = os.path.join(base_url, url + ext_archive)

basename = os.path.basename(url)
archive = os.path.join(root, basename)

basename = basename.split(".")[0]
folder_in_archive = os.path.join(folder_in_archive, basename)

self._path = os.path.join(root, folder_in_archive)

if download:
if not os.path.isdir(self._path):
if not os.path.isfile(archive):
checksum = _CHECKSUMS.get(url, None)
download_url(url, root, hash_value=checksum)
extract_archive(archive)

walker = walk_files(
self._path, suffix=self._ext_audio, prefix=False, remove_suffix=True
)
self._walker = list(walker)

def __getitem__(self, n: int) -> Tuple[Tensor, int, str, str, int, int, str]:
fileid = self._walker[n]
return load_libritts_item(
fileid,
self._path,
self._ext_audio,
self._ext_original_txt,
self._ext_normalized_txt,
)

def __len__(self) -> int:
return len(self._walker)
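
Taken together, the new dataset follows the same pattern as the other torchaudio datasets. A minimal usage sketch (the root directory is illustrative; download=True fetches the default train-clean-100 subset and extracts it under root/LibriTTS):

    import os
    from torchaudio.datasets import LIBRITTS

    # Illustrative root; the archive is downloaded and extracted on first use.
    dataset = LIBRITTS(os.path.expanduser("~/datasets"), download=True)

    # Each item is the 7-tuple documented in the class docstring.
    (waveform, sample_rate, original_text, normalized_text,
     speaker_id, chapter_id, utterance_id) = dataset[0]
    print(sample_rate, speaker_id, chapter_id, utterance_id)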
