-
Notifications
You must be signed in to change notification settings - Fork 661
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add tedlium dataset (all 3 releases) #882
Changes from 10 commits
b695d9d
d684ac7
e1b3256
3cde3eb
9655036
e76ba7a
d3fede5
556a7f3
3f18636
8a0b922
90d1db1
5125ebf
f6bae1c
0dfda8d
eecd46a
b38a13c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
@@ -0,0 +1,179 @@ | ||||||
import os | ||||||
from typing import Tuple | ||||||
|
||||||
import torchaudio | ||||||
from torch import Tensor | ||||||
from torch.utils.data import Dataset | ||||||
from torchaudio.datasets.utils import ( | ||||||
download_url, | ||||||
extract_archive, | ||||||
walk_files, | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You can remove |
||||||
) | ||||||
from collections import namedtuple | ||||||
|
||||||
|
||||||
_RELEASE_CONFIGS = { | ||||||
mthrok marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
"release1": { | ||||||
"folder_in_archive": "TEDLIUM_release1", | ||||||
"url": "http://www.openslr.org/resources/7/TEDLIUM_release1.tar.gz", | ||||||
"checksum": "30301975fd8c5cac4040c261c0852f57cfa8adbbad2ce78e77e4986957445f27", | ||||||
"data_path": "", | ||||||
"subset": "train", | ||||||
"dict": "TEDLIUM.150K.dic", | ||||||
}, | ||||||
"release2": { | ||||||
"folder_in_archive": "TEDLIUM_release2", | ||||||
"url": "http://www.openslr.org/resources/19/TEDLIUM_release2.tar.gz", | ||||||
"checksum": "93281b5fcaaae5c88671c9d000b443cb3c7ea3499ad12010b3934ca41a7b9c58", | ||||||
"data_path": "", | ||||||
"subset": "train", | ||||||
"dict": "TEDLIUM.152k.dic", | ||||||
}, | ||||||
"release3": { | ||||||
"folder_in_archive": "TEDLIUM_release-3", | ||||||
"url": "http://www.openslr.org/resources/51/TEDLIUM_release-3.tgz", | ||||||
"checksum": "ad1e454d14d1ad550bc2564c462d87c7a7ec83d4dc2b9210f22ab4973b9eccdb", | ||||||
"data_path": "data/", | ||||||
"subset": None, | ||||||
"dict": "TEDLIUM.152k.dic", | ||||||
}, | ||||||
} | ||||||
|
||||||
Tedlium_item = namedtuple( | ||||||
"Tedlium_item", ["waveform", "sample_rate", "transcript", "talk_id", "speaker_id", "identifier"] | ||||||
) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No problem! |
||||||
|
||||||
|
||||||
class TEDLIUM(Dataset): | ||||||
""" | ||||||
Create a Dataset for Tedlium. Each item is a tuple of the form: | ||||||
[waveform, sample_rate, transcript, talk_id, speaker_id, identifier] | ||||||
""" | ||||||
|
||||||
def __init__( | ||||||
self, root: str, release: str = "release1", subset: str = None, download: bool = False, audio_ext=".sph" | ||||||
) -> None: | ||||||
"""Constructor for TEDLIUM dataset | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you move this docstring to class-level docstring and merge them? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I just checked with the other datasets, they have a similar docstring under the class description:
Librispeech:
Are you sure you want to move the big constructor docstring to class-level? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Yes. The fact that the other dataset lacks the documentation for the constructor is not user-friendly. |
||||||
|
||||||
Args: | ||||||
root (str): Path containing dataset or target path where its downloaded if needed | ||||||
release (str, optional): TEDLIUM identifier (release1,release2,release3). Defaults to RELEASE. | ||||||
subset (str, optional): train/dev/test for releases 1&2, None for release3. Defaults to Train/None | ||||||
download (bool, optional): Download dataset in case is not founded in root path. Defaults to False. | ||||||
audio_ext (str, optional): Overwrite audio extension when loading items. Defaults to ".sph". | ||||||
|
||||||
Raises: | ||||||
RuntimeError: If release identifier does not match any supported release, | ||||||
""" | ||||||
self._ext_audio = audio_ext | ||||||
if release in _RELEASE_CONFIGS.keys(): | ||||||
folder_in_archive = _RELEASE_CONFIGS[release]["folder_in_archive"] | ||||||
url = _RELEASE_CONFIGS[release]["url"] | ||||||
subset = subset if subset else _RELEASE_CONFIGS[release]["subset"] | ||||||
else: | ||||||
# Raise warning | ||||||
raise RuntimeError( | ||||||
"The release {} does not match any of the supported tedlium releases{} ".format( | ||||||
release, _RELEASE_CONFIGS.keys(), | ||||||
) | ||||||
) | ||||||
|
||||||
basename = os.path.basename(url) | ||||||
archive = os.path.join(root, basename) | ||||||
|
||||||
basename = basename.split(".")[0] | ||||||
|
||||||
self._path = os.path.join(root, folder_in_archive, _RELEASE_CONFIGS[release]["data_path"]) | ||||||
if subset in ["train", "dev", "test"]: | ||||||
self._path = os.path.join(self._path, subset) | ||||||
if download: | ||||||
if not os.path.isdir(self._path): | ||||||
if not os.path.isfile(archive): | ||||||
checksum = _RELEASE_CONFIGS[release]["checksum"] | ||||||
download_url(url, root, hash_value=checksum) | ||||||
extract_archive(archive) | ||||||
|
||||||
# Create walker for all samples | ||||||
self._walker = [] | ||||||
stm_path = os.path.join(self._path, "stm") | ||||||
for file in os.listdir(stm_path): | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you sort the file? Different OS returns files in different order.
Suggested change
|
||||||
if file.endswith(".stm"): | ||||||
stm_path = os.path.join(self._path, "stm", file) | ||||||
with open(stm_path) as f: | ||||||
l = len(f.readlines()) | ||||||
file = file.replace(".stm", "") | ||||||
self._walker.extend((file, line) for line in range(l)) | ||||||
|
||||||
# Read phoneme dictionary | ||||||
dict_path = os.path.join(root, folder_in_archive, _RELEASE_CONFIGS[release]["dict"]) | ||||||
self.phoneme_dict = {} | ||||||
with open(dict_path, "r", encoding="utf-8") as f: | ||||||
for line in f.readlines(): | ||||||
content = line.strip().split(maxsplit=1) | ||||||
self.phoneme_dict[content[0]] = content[1:] # content[1:] can be empty list | ||||||
|
||||||
def load_tedlium_item(self, fileid: str, line: int, path: str) -> Tedlium_item: | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you also mark this function as private?
Suggested change
|
||||||
"""Loads a TEDLIUM dataset sample given a file name and corresponding sentence name | ||||||
|
||||||
Args: | ||||||
fileid (str): File id to identify both text and audio files corresponding to the sample | ||||||
line (int): Line identifier for the sample inside the text file | ||||||
path (str): Dataset root path | ||||||
|
||||||
Returns: | ||||||
Tedlium_item: A namedTuple containing [waveform, sample_rate, transcript, talk_id, speaker_id, identifier] | ||||||
""" | ||||||
transcript_path = os.path.join(path, "stm", fileid) | ||||||
with open(transcript_path + ".stm") as f: | ||||||
transcript = f.readlines()[line] | ||||||
talk_id, _, speaker_id, start_time, end_time, identifier, transcript = transcript.split(" ", 6) | ||||||
|
||||||
wave_path = os.path.join(path, "sph", fileid) | ||||||
waveform, sample_rate = self._load_audio(wave_path + self._ext_audio, start_time=start_time, end_time=end_time) | ||||||
|
||||||
return Tedlium_item(waveform, sample_rate, transcript, talk_id, speaker_id, identifier) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I can't find the discussion about NamedTuple, so I'm leaving a new comment here :) There has been prior discussion about changing the type of data points (and in particular using NamedTuple), see internal document. Though there may be value in redefining what a data point is, I feel the discussion would delay this pull request, so I would avoid it for now. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I can't access the document but regardless I have no problem with changing it back to the previous version. We can delay this NamedTuple to a PR that will change multiple Datasets/files and not do 1 by 1 and get inconsistent code. What do you think? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Yes, I agree with your suggestion. It would be preferable to change them all at the same time once we settle on a specific format. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I oppose to using tuple here. 6 heterogeneous items as tuple give very bad user experience and make code harder to read/maintain. Specifically named tuple is compatible with tuple, there is virtually no disadvantage of using named tuple here. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The main disadvantage of using namedtuple here is (1) inconsistent behavior with other datasets in audio and elsewhere, (2) would make the user leverage an API that could change in the near future, (3) will slow down this particular pull request. The reason we did not standardize around NamedTuple in the past was that it didn't allow for easy introspection of keys, and each dataset would have its own DataPoint NamedTuple. Please see internal document for more discussions. We are also using tuples with many items with other torchaudio datasets. That being said, we can revisit these design choices. Thoughts @zhangguanheng66 @fmassa @cpuhrsch ? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Can you elaborate this? Each dataset returns different number of items, so it does not look like they can be swapped easily from the beginning. And using NamedTuple with consistent key name will make it possible to give consistent user experience.
Not sure what this means. Can you elaborate?
It's already in NamedTuple so, it seems that changing it tuple seems to be slowing down the process.
When is it necessary or useful to perform key introspection?
That sounds the right approach. Because each dataset returns the different number of items of different types. They should be typed differently. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I was really just referring to using NamedTuple in this particular pull request. Moving toward consistently using NamedTuple in torchaudio/text/vision is fine by me.
Since we have not, as a team (torchaudio/text/vision), revisited the decision of using Tuple, using NamedTuple here exposes us to changing the API again would we settle on another option.
But we should avoid using NamedTuple until we, as a team, have made a decision on the API for data point. The current decision had been Tuple, but I'm personally fine with revisiting this past decision.
I don't feel too strongly about either points. I am really just listing reasons that we left in the internal document. :) |
||||||
|
||||||
def _load_audio(self, path: str, start_time: float, end_time: float, sample_rate: int = 16000) -> [Tensor, int]: | ||||||
"""Default load function used in TEDLIUM dataset, you can overwrite this function to customize functionality | ||||||
and load individual sentnces from a full ted audio talk file | ||||||
|
||||||
Args: | ||||||
path (str): Path to audio file | ||||||
start_time (int, optional): Time in seconds where the sample sentence stars | ||||||
end_time (int, optional): Time in seconds where the sample sentence finishes | ||||||
|
||||||
Returns: | ||||||
[Tensor, int]: Audio tensor representation and sample rate | ||||||
""" | ||||||
start_time = int(float(start_time) * 16000) | ||||||
end_time = int(float(end_time) * 16000) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you use |
||||||
return torchaudio.load(path, frame_offset=start_time, num_frames=end_time - start_time) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I just realized that this will only work with
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The current sox backend also support reading file segment. Not soundfile? If the datasets needs to be aware of the backend, then we should see how we could evolve the backends also so that this does not need to happen. Thoughts? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
We either:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
We have already planned to resolve this in 0.9.0 release. See #903 |
||||||
|
||||||
def __getitem__(self, n: int) -> Tedlium_item: | ||||||
"""TEDLIUM dataset custom function overwritting default loadbehaviour. | ||||||
Loads a TEDLIUM sample given a index N | ||||||
|
||||||
Args: | ||||||
n (int): Index of sample to be loaded | ||||||
|
||||||
Returns: | ||||||
Tedlium_item: A namedTuple containing [waveform, sample_rate, transcript, talk_id, speaker_id, identifier] | ||||||
""" | ||||||
fileid, line = self._walker[n] | ||||||
return self.load_tedlium_item(fileid, line, self._path) | ||||||
|
||||||
def __len__(self) -> int: | ||||||
"""DTEDLIUM dataset custom function overwritting len default behaviour. | ||||||
|
||||||
Returns: | ||||||
int: TEDLIUM dataset length | ||||||
""" | ||||||
return len(self._walker) | ||||||
|
||||||
def get_phoneme_dict(self): | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||||||
"""Returns the phoneme dictionary of a TEDLIUM release | ||||||
|
||||||
Returns: | ||||||
dictionary: Phoneme dictionary for the current tedlium release | ||||||
""" | ||||||
return self.phoneme_dict |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@mthrok What does special-members mean here? I interpretate it as extra functions to include in the docs? Thats why I included get_phoneme_dict
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These are Sphinx's directive. Check out their documentation
:members:
is where you list the members you want document:special-members:
is where you list the special methods like__init__
,__len__
,__getitem__
etc...I think the other documentations are wrong, (
__getitem__
should be under:special-member:
but it will not show up either way because they don't have a docstring.)I think you can just do
.. autoclass:: TEDLIUM
and the rest (get_phoneme_dict
) will be handled.You can build the documentation and check how the resulting documentation looks like.