-
Notifications
You must be signed in to change notification settings - Fork 661
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add tedlium dataset (all 3 releases) #882
Merged
Merged
Changes from all commits
Commits
Show all changes
16 commits
Select commit
Hold shift + click to select a range
b695d9d
Added tedlium support for 3 releases
jiwidi d684ac7
Minor fixes from PR feedback and better formatting
jiwidi e1b3256
Minor fixes from PR feedback and better formatting
jiwidi 3cde3eb
Minor fixes from PR feedback and better formatting
jiwidi 9655036
Minor fixes from PR feedback and docstrings
jiwidi e76ba7a
Style fix
jiwidi d3fede5
Changes from PR feedback
jiwidi 556a7f3
Changes from PR feedback and phoneme dict function
jiwidi 3f18636
Changes from PR feedback
jiwidi 8a0b922
Changes to dataset docs, adding tedlium
jiwidi 90d1db1
Changes from PR feedback
jiwidi 5125ebf
Tedlium test and minor improvements to tedlium class
jiwidi f6bae1c
Created test for every release and improvements from PR feedback
jiwidi 0dfda8d
PR feedback changes
jiwidi eecd46a
Test for dic loading and fix naming private variables
jiwidi b38a13c
fix style for tedlium test
jiwidi File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,153 @@ | ||
import os | ||
|
||
from torchaudio.datasets import tedlium | ||
|
||
from torchaudio_unittest.common_utils import ( | ||
TempDirMixin, | ||
TorchaudioTestCase, | ||
get_whitenoise, | ||
save_wav, | ||
normalize_wav, | ||
) | ||
|
||
# Transcript (.stm) lines written for every mocked release. Each line is
# space-separated as: talk id, an unused field, speaker id, start time (s),
# end time (s), identifier, transcript. The five lines cover consecutive
# two-second segments of one ten-second dummy recording, each with its own
# transcript so individual utterances can be told apart.
UTTERANCES = [
    f"AaronHuey_2010X 1 AaronHuey_2010X {begin:.1f} {begin + 2:.1f} <o,f0,female> script{number}\n"
    for number, begin in enumerate(range(0, 10, 2), start=1)
]

# Lines written to every mocked release's .dic file: the first token is the
# dictionary key and the remaining tokens are its phonemes. Must stay a list,
# because the tests compare a freshly built list against it.
PHONEME = [
    "a AH",
    "a(2) EY",
    "aachen AA K AH N",
    "aad AE D",
    "aaden EY D AH N",
    "aadmi AE D M IY",
    "aae EY EY",
]
|
||
|
||
class TestTedlium(TempDirMixin, TorchaudioTestCase):
    """Check ``tedlium.TEDLIUM`` against small mocked copies of all three releases.

    ``setUpClass`` writes, for every release, one white-noise audio file, one
    ``.stm`` transcript containing ``UTTERANCES`` and one ``.dic`` file
    containing ``PHONEME`` under a temporary directory. Each test then loads
    one release and compares what the dataset yields with the expected
    samples recorded in ``samples``.
    """

    backend = "default"

    # Root directory of the mocked dataset; assigned in setUpClass.
    root_dir = None
    # Maps release name -> list of expected sample tuples; filled in setUpClass.
    samples = {}

    @staticmethod
    def _release_dir(dataset_dir, release):
        """Return the directory in which the mocked files of ``release`` are written.

        Releases 1 and 2 use the ``"subset"`` entry of the release config,
        release 3 uses its ``"data_path"`` entry.
        """
        config = tedlium._RELEASE_CONFIGS[release]
        leaf_key = "subset" if release in ("release1", "release2") else "data_path"
        return os.path.join(dataset_dir, config["folder_in_archive"], config[leaf_key])

    @staticmethod
    def _expected_sample(data, sample_rate, utterance):
        """Build the tuple the dataset is expected to yield for one transcript line.

        Args:
            data: waveform saved as the release's audio file; sliced on its
                second axis.
            sample_rate: sample rate used to convert seconds into sample offsets.
            utterance: one line of ``UTTERANCES``.

        Returns:
            ``(waveform_slice, sample_rate, transcript, talk_id, speaker_id, identifier)``
        """
        # maxsplit=6 keeps the transcript (including its trailing newline) intact.
        talk_id, _, speaker_id, start_time, end_time, identifier, transcript = utterance.split(" ", 6)
        first = int(float(start_time)) * sample_rate
        last = int(float(end_time)) * sample_rate
        return (data[:, first:last], sample_rate, transcript, talk_id, speaker_id, identifier)

    @classmethod
    def setUpClass(cls):
        """Write the mocked releases to disk and record the expected samples."""
        cls.root_dir = dataset_dir = os.path.join(cls.get_base_temp_dir(), "tedlium")
        os.makedirs(dataset_dir, exist_ok=True)
        # Rebind instead of filling the dict declared on the class body, so the
        # expectations are owned by the class whose setUpClass actually ran.
        cls.samples = {}
        sample_rate = 16000  # 16kHz

        # A different seed per release gives every release its own noise.
        for seed, release in enumerate(["release1", "release2", "release3"]):
            data = get_whitenoise(sample_rate=sample_rate, duration=10.00, n_channels=1, dtype="float32", seed=seed)

            release_dir = cls._release_dir(dataset_dir, release)
            transcript_dir = os.path.join(release_dir, "stm")  # Subfolder for transcripts
            audio_dir = os.path.join(release_dir, "sph")  # Subfolder for audio files
            os.makedirs(transcript_dir, exist_ok=True)
            os.makedirs(audio_dir, exist_ok=True)

            save_wav(os.path.join(audio_dir, f"{release}.sph"), data, sample_rate)

            with open(os.path.join(transcript_dir, f"{release}.stm"), "w") as f:
                f.write("".join(UTTERANCES))

            with open(os.path.join(release_dir, f"{release}.dic"), "w") as f:
                f.write("\n".join(PHONEME))

            # Expected samples to compare the dataset output with.
            cls.samples[release] = [
                cls._expected_sample(data, sample_rate, utterance) for utterance in UTTERANCES
            ]

    def _check_release(self, release):
        """Load ``release`` and compare its samples and phoneme dictionary with the mocks."""
        expected = self.samples[release]
        dataset = tedlium.TEDLIUM(self.root_dir, release=release)

        num_samples = 0
        for i, (data, sample_rate, transcript, talk_id, speaker_id, identifier) in enumerate(dataset):
            self.assertEqual(data, expected[i][0], atol=5e-5, rtol=1e-8)
            assert sample_rate == expected[i][1]
            assert transcript == expected[i][2]
            assert talk_id == expected[i][3]
            assert speaker_id == expected[i][4]
            assert identifier == expected[i][5]
            num_samples += 1

        assert num_samples == len(expected)

        # Redirect the dataset to the mocked dictionary before reading it.
        # NOTE(review): presumably dataset._path is the release directory that
        # setUpClass wrote the .dic file into -- confirm against TEDLIUM.
        dataset._dict_path = os.path.join(dataset._path, f"{release}.dic")
        phoneme_dict = dataset.phoneme_dict
        phonemes = [f"{key} {' '.join(value)}" for key, value in phoneme_dict.items()]
        assert phonemes == PHONEME

    def test_tedlium_release1(self):
        """The mocked release 1 is loaded correctly."""
        self._check_release("release1")

    def test_tedlium_release2(self):
        """The mocked release 2 is loaded correctly."""
        self._check_release("release2")

    def test_tedlium_release3(self):
        """The mocked release 3 is loaded correctly."""
        self._check_release("release3")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@mthrok What does `special-members` mean here? I interpreted it as extra functions to include in the docs. That's why I included `get_phoneme_dict`.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These are Sphinx directives. Check out their documentation.
`:members:` is where you list the members you want to document. `:special-members:` is where you list the special methods like `__init__`, `__len__`, `__getitem__`, etc.
I think the other documentation is wrong (`__getitem__` should be under `:special-members:`, but it will not show up either way because they don't have a docstring).
I think you can just do `.. autoclass:: TEDLIUM` and the rest (`get_phoneme_dict`) will be handled.
You can build the documentation and check what the resulting documentation looks like.