From 774bde8bafdc2c286f227a307a20c33dc7028cbc Mon Sep 17 00:00:00 2001 From: Matthew Maciejewski Date: Tue, 16 Jan 2024 11:37:15 -0500 Subject: [PATCH 01/10] initial commit --- lhotse/bin/modes/recipes/__init__.py | 1 + lhotse/bin/modes/recipes/sbcsae.py | 37 ++ lhotse/recipes/__init__.py | 1 + lhotse/recipes/sbcsae.py | 798 +++++++++++++++++++++++++++ 4 files changed, 837 insertions(+) create mode 100644 lhotse/bin/modes/recipes/sbcsae.py create mode 100644 lhotse/recipes/sbcsae.py diff --git a/lhotse/bin/modes/recipes/__init__.py b/lhotse/bin/modes/recipes/__init__.py index 5df6ef8de..11a98395b 100644 --- a/lhotse/bin/modes/recipes/__init__.py +++ b/lhotse/bin/modes/recipes/__init__.py @@ -62,6 +62,7 @@ from .peoples_speech import * from .primewords import * from .rir_noise import * +from .sbcsae import * from .speechcommands import * from .spgispeech import * from .stcmds import * diff --git a/lhotse/bin/modes/recipes/sbcsae.py b/lhotse/bin/modes/recipes/sbcsae.py new file mode 100644 index 000000000..23f36e09b --- /dev/null +++ b/lhotse/bin/modes/recipes/sbcsae.py @@ -0,0 +1,37 @@ +from typing import Optional, Sequence + +import click + +from lhotse.bin.modes import download, prepare +from lhotse.recipes.sbcsae import download_sbcsae, prepare_sbcsae +from lhotse.utils import Pathlike + +__all__ = ["sbcsae"] + + +@prepare.command(context_settings=dict(show_default=True)) +@click.argument("corpus_dir", type=click.Path(exists=True, dir_okay=True)) +@click.argument("output_dir", type=click.Path()) +def sbcsae( + corpus_dir: Pathlike, + output_dir: Pathlike, +): + """SBCSAE data preparation.""" + prepare_sbcsae(corpus_dir, output_dir=output_dir) + + +@download.command(context_settings=dict(show_default=True)) +@click.argument("target_dir", type=click.Path()) +@click.option( + "--download-mp3", + type=bool, + is_flag=True, + default=False, + help="Download the mp3 copy of the audio as well as wav.", +) +def sbcsae( + target_dir: Pathlike, + download_mp3: Optional[bool] = False, +): + """SBCSAE download.""" + download_sbcsae(target_dir, download_mp3=download_mp3) diff --git a/lhotse/recipes/__init__.py b/lhotse/recipes/__init__.py index 57e129ea1..2fc7a6f0b 100644 --- a/lhotse/recipes/__init__.py +++ b/lhotse/recipes/__init__.py @@ -63,6 +63,7 @@ from .nsc import prepare_nsc from .peoples_speech import prepare_peoples_speech from .rir_noise import download_rir_noise, prepare_rir_noise +from .sbcsae import prepare_sbcsae from .speechcommands import download_speechcommands, prepare_speechcommands from .spgispeech import download_spgispeech, prepare_spgispeech from .stcmds import download_stcmds, prepare_stcmds diff --git a/lhotse/recipes/sbcsae.py b/lhotse/recipes/sbcsae.py new file mode 100644 index 000000000..f855e2d01 --- /dev/null +++ b/lhotse/recipes/sbcsae.py @@ -0,0 +1,798 @@ +""" +This script downloads and prepares the data directory for the Santa Barbara +Corpus of Spoken American English. + +The Santa Barbara Corpus of Spoken American English is based on a large body of +recordings of naturally occurring spoken interaction from all over the United +States. The Santa Barbara Corpus represents a wide variety of people of +different regional origins, ages, occupations, genders, and ethnic and social +backgrounds. 
The predominant form of language use represented is face-to-face +conversation, but the corpus also documents many other ways that that people use +language in their everyday lives: telephone conversations, card games, food +preparation, on-the-job talk, classroom lectures, sermons, story-telling, town +hall meetings, tour-guide spiels, and more. + +The Santa Barbara Corpus was compiled by researchers in the Linguistics +Department of the University of California, Santa Barbara. The Director of the +Santa Barbara Corpus is John W. Du Bois, working with Associate Editors Wallace +L. Chafe and Sandra A. Thompson (all of UC Santa Barbara), and Charles Meyer +(UMass, Boston). For the publication of Parts 3 and 4, the authors are John W. +Du Bois and Robert Englebretson. + +TODO: detail on splits and such +""" +import logging +import re +from pathlib import Path +from typing import Dict, Iterable, List, Optional, Sequence, Union + +from lhotse import Recording, RecordingSet, SupervisionSegment, SupervisionSet +from lhotse.utils import Pathlike, resumable_download + +TALKBANK_MP3_ROOT_URL = "https://media.talkbank.org/ca/SBCSAE/" +TALKBANK_WAV_ROOT_URL = "https://media.talkbank.org/ca/SBCSAE/0wav/" + +lang_iterators = { + "SBC004": iter(["Spanish"] * 17), + "SBC006": iter(["French"] * 2), + "SBC010": iter(["Spanish"]), + "SBC012": iter(["Greek"] * 2), + "SBC015": iter(["Spanish"] * 10), + "SBC025": iter(["German"] * 2 + ["Latin"]), + "SBC027": iter(["Spanish"] * 6 + ["French"] * 2), + "SBC031": iter(["French"] * 2), + "SBC033": iter(["French"]), + "SBC034": iter(["French"] * 3), + "SBC036": iter(["Spanish"] * 36), + "SBC037": iter(["Spanish"] * 60), + "SBC047": iter(["Spanish"]), + "SBC057": iter(["Japanese"] * 62), + "SBC058": iter(["Spanish"] + ["Italian"] * 2), +} + + +class Dummy_Spk_Iterator: + def __init__(self): + self.ind = 213 + + def next(self, spk="SBCXXX_X"): + self.ind = self.ind + 1 + name = "_".join(spk.split("_")[1:]) + if name.startswith("X") or name.startswith("AUD"): + name = "UNK" + return f"{self.ind:04d}_{name}" + + +dummy_spk_iterator = Dummy_Spk_Iterator() + + +def download_sbcsae( + target_dir: Pathlike = ".", + download_mp3: Optional[bool] = False, +) -> Path: + """ + Download the dataset. Due to availability/broken link issues, this downloads + from multiple sources. + + :param: target_dir: Pathlike, the path of the directory where the SBCSAE + dataset will be downloaded. + :param: download_mp3: bool, if True download the mp3 files as well as wav. + :return: The path to the directory with the data. + """ + target_dir = Path(target_dir) + corpus_dir = target_dir / "SBCSAE" + corpus_dir.mkdir(parents=True, exist_ok=True) + + completed_detector = target_dir / ".sbcsae_completed" + if completed_detector.is_fil(): + logging.info(f"Skipping download because {completed_detector} exists.") + return corpus_dir + return "FALSE" + + +def prepare_sbcsae( + corpus_dir: Pathlike, + output_dir: Optional[Pathlike] = None, +) -> Dict[str, Union[RecordingSet, SupervisionSet]]: + """ + Prepares manifest for SBCSAE dataset. + + :param: corpus_dir: Path to the root where SBCSAE data was downloaded. It + should be called SBCSAE. There is no consistent formatting between + releases of the data. Check script comments for details if using an + existing corpus download rather than Lhotse's download script. + :param: output_dir: Root directory where .json manifests are stored. 
+ :return: + """ + # Resolve corpus_dir type + if isinstance(corpus_dir, str): + corpus_dir = Path(corpus_dir) + + # Resolve output_dir type + if isinstance(output_dir, str): + output_dir = Path(output_dir) + + audio_dir = corpus_dir / "WAV" + recordings = RecordingSet.from_recordings( + Recording.from_file(p) for p in audio_dir.glob("*.wav") + ) + if len(recordings) == 0: + logging.warning(f"No .wav files found in {audio_dir}") + + doc_dir = corpus_dir / "documentation" + spk2gen_dict, spk2glob_dict = generate_speaker_map_dicts(doc_dir) + + supervisions = [] + trn_dir = corpus_dir / "TRN" + for p in trn_dir.glob("*.trn"): + for supervision in _filename_to_supervisions(p, spk2gen_dict, spk2glob_dict): + supervisions.append(supervision) + + if len(supervisions) == 0: + logging.warning(f"No supervisions found in {trn_dir}") + supervisions = SupervisionSet.from_segments(supervisions) + + if output_dir is not None: + if isinstance(output_dir, str): + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + recordings.to_file(output_dir / "sbcsae_recordings.jsonl.gz") + supervisions.to_file(output_dir / "sbcsae_supervisions.jsonl.gz") + + manifests = {"recordings": recordings, "supervisions": supervisions} + + return manifests + + +def generate_speaker_map_dicts(doc_dir: Path): + spk2gen_dict = dict() + spk2glob_dict = dict() + + spk_num_to_reco_ids = dict() + for LDC_split in ["LDC2000S85", "LDC2003S06", "LDC2005S25"]: + filename = doc_dir / LDC_split / "segment.tbl" + for line in filename.read_text().split("\n"): + if "speaker:" in line: + line = line.replace(" 0", "\t0") + reco_id = re.sub(r"sbc0?([0-9]{3})\s.*", r"SBC\1", line) + spk_num = line.split("\t")[-1][:4] + if spk_num not in spk_num_to_reco_ids: + spk_num_to_reco_ids[spk_num] = [] + if reco_id not in spk_num_to_reco_ids[spk_num]: + spk_num_to_reco_ids[spk_num].append(reco_id) + + for LDC_split in ["LDC2000S85", "LDC2003S06", "LDC2005S25"]: + filename = doc_dir / LDC_split / "speaker.tbl" + for line in filename.read_text().split("\n"): + if "," not in line: + continue + line = line.replace("0163,Dan,m", "0166,Dan,M") + spk_num, name, gen = line.split(",")[:3] + name = ( + name.replace(" (extra-corpus)", "").upper().split(" ")[-1].split("/")[0] + ) + gen = gen.upper() + if not gen: + gen = None + + if spk_num in ["0069", "0091", "0092", "0097"]: + continue + for reco in spk_num_to_reco_ids[spk_num]: + spk2gen_dict[reco + "_" + name] = gen + spk2glob_dict[reco + "_" + name] = spk_num + "_" + name + + for LDC_split in ["LDC2004S10"]: + seg_list = [] + filename = doc_dir / LDC_split / "segment.tbl" + for line in filename.read_text().split("\n"): + if "speaker:" in line: + reco_id = re.sub(r"sbc0?([0-9]{3})\s.*", r"SBC\1", line) + name = line.split(" ")[-1].upper().split("/")[0] + seg_list.append([name, reco_id]) + + spk_list = [] + filename = doc_dir / LDC_split / "speaker.tbl" + for line in filename.read_text().split("\n"): + if "," not in line: + continue + spk_num, name, gen = line.split(",")[:3] + name = name.upper().split("/")[0] + spk_list.append([name, spk_num, gen]) + + for seg_info, spk_info in zip(seg_list, spk_list): + assert seg_info[0] == spk_info[0], f"{seg_info[0]} != {spk_info[0]}" + spk2gen_dict[seg_info[1] + "_" + seg_info[0]] = spk_info[2] + spk2glob_dict[seg_info[1] + "_" + seg_info[0]] = ( + spk_info[1] + "_" + spk_info[0] + ) + + for spk_key in [ + "SBC006_ALL", + "SBC008_ALL", + "SBC012_MANY", + "SBC020_AUD", + "SBC021_MANY", + "SBC023_MANY", + "SBC025_AUD", + "SBC026_AUD", + "SBC027_MANY", 
+ "SBC027_AUD", + "SBC028_BOTH", + "SBC030_AUD", + "SBC038_AUD", + "SBC053_RADIO", + "SBC054_AUD", + "SBC054_MANY", + "SBC055_AUD", + ]: + spk2gen_dict[spk_key] = None + spk2glob_dict[spk_key] = spk_key + + return spk2gen_dict, spk2glob_dict + + +def _filename_to_supervisions(filename: Path, spk2gen_dict: dict, spk2glob_dict: dict): + reco_id = filename.stem.split(".")[0] + lines = filename.read_text(encoding="latin1") + supervisions = [] + + #### Transcript fix + lines = lines.replace("\x92", "'") + lines = lines.replace("\u007f", "") + lines = lines.replace("\u0000", "c") + + if reco_id == "SBC002": + lines = lines.replace("(TSK ", "(TSK) ") + elif reco_id == "SBC004": + lines = lines.replace("KATE", "KATHY") + lines = lines.replace("sen~orita", "se\xf1orita") + elif reco_id == "SBC005": + lines = lines.replace("good_/god/", "good") + lines = lines.replace("(H)@>", "(H) @>") + lines = lines.replace("[@@ <@Mm@>]", "[@@ <@ Mm @>]") + elif reco_id == "SBC006": + lines = lines.replace("/pub/", "pub") + lines = lines.replace("", "") + lines = lines.replace("[2(H)2]1", "[2(H)2]") + elif reco_id == "SBC007": + lines = lines.replace( + "\\000000000 000000000 MARY: 1182.90 1186.92\t ", + "\n1182.90 1186.92\tMARY: ", + ) + lines = lines.replace("(YAWN0", "(YAWN)") + elif reco_id == "SBC008": + lines = lines.replace("[", "[") + elif reco_id == "SBC012": + lines = lines.replace( + "\n".join(["807.02 807.92\tFRANK: \t.. Mhm."] * 2), + "807.02 807.92\tFRANK: \t.. Mhm.", + ) + lines = lines.replace("MONTOYA", "MONTOYO") + elif reco_id == "SBC013": + lines = lines.replace("[8<@She8]", "[8<@ She8]") + lines = lines.replace("[2(H) cou_ couch@>2]", "[2(H) cou_ couch @>2]") + lines = lines.replace("[4<@No=4]", "[4<@ No=4]") + lines = lines.replace("VOX2]", "VOX>2]") + elif reco_id == "SBC014": + lines = lines.replace("\\000000000 000000000 ", "\n") + lines = lines.replace("<@he thought", "<@ he thought") + elif reco_id == "SBC015": + lines = lines.replace( + "243.055\t244.080\tKEN:\t(H)] the little,", + "243.465\t244.670\tKEN:\t(H)] the little,", + ) + lines = lines.replace("\u0000urch things.", "church things.") + lines = lines.replace("2(H]=2", "2(H)=2") + lines = lines.replace(" 0.000000e+00", "e") + lines = lines.replace("0m=,", "um=,") + lines = lines.replace("0eople", "people") + lines = lines.replace("0id", "did") + lines = lines.replace("X 0ne %tho", "X uh line %tho") + lines = lines.replace("and 0t [was]", "and it [was]") + lines = lines.replace("0t was like", "it was like") + elif reco_id == "SBC016": + lines = lines.replace("/sed ai/", "sed ai") + elif reco_id == "SBC017": + lines = lines.replace("a\tand names the] na=me,", "and names the] na=me,") + lines = lines.replace(" 0.000000e+00", "e") + lines = lines.replace("[2I mean2", "[2I mean2]") + lines = lines.replace("no2.", "no.") + lines = lines.replace("0rganisms", "organisms") + lines = lines.replace("0ttle", "little") + elif reco_id == "SBC018": + lines = lines.replace("0f", "if") + elif reco_id == "SBC019": + lines = lines.replace("cello_(/cheller/)", "cheller") + lines = lines.replace("(sigh)", "(SIGH)") + lines = lines.replace(" Mo=m", "]", "[]") + lines = lines.replace("5]", "X>5]") + lines = lines.replace("0nly", "uh only") + lines = lines.replace("[50r5]", "[5Or5]") + elif reco_id == "SBC024": + lines = lines.replace(" >ENV: ", ">ENV:\t") + lines = lines.replace(" 0.000000irst", "First") + lines = lines.replace("2[cause", "[2cause") + lines = lines.replace(" 0oes", "does") + lines = lines.replace("0id]", "did]") + elif reco_id == 
"SBC025": + lines = lines.replace("", "<@ Oh[2= @>") + lines = lines.replace(" 0.000000", " ") + lines = lines.replace("i 0f", "i- if") + lines = lines.replace("0f we", "if we") + lines = lines.replace("th- 0t's", "th- that's") + lines = lines.replace("0t's", "it's") + lines = lines.replace("0f", "if") + elif reco_id == "SBC029": + lines = lines.replace("96.230\t98.240\t>ENV: ", "96.230\t98.240\t>ENV:\t") + lines = lines.replace("(H )", "(H)") + lines = lines.replace("<0h=,", "<% Oh=,") + lines = lines.replace("knowX>]", "know X>]") + lines = lines.replace("0verheating", "overheating") + elif reco_id == "SBC030": + lines = lines.replace("DANNY", "BRADLEY") + lines = lines.replace("AUD:\tYes", "X:\tYes") + elif reco_id == "SBC034": + lines = lines.replace("13548.02 ", "1354.802") + elif reco_id == "SBC036": + lines = lines.replace( + "1558.463\t1558.906\t\t[thought he was,", + "1558.906\t1558.923\t\t[thought he was,", + ) + elif reco_id == "SBC038": + lines = lines.replace("AUD:\t... What's", "X_2:\t... What's") + lines = lines.replace("AUD:\t... U", "X_3:\t... U") + lines = lines.replace("AUD:\t... How far", "X_2:\t... How far") + lines = lines.replace("AUD:\t", "") + lines = lines.replace("ANNETTE", "ANETTE") + elif reco_id == "SBC048": + lines = lines.replace("<@in San[2ta", "<@ in San[2ta") + elif reco_id == "SBC052": + lines = lines.replace("~Janine\t said", "~Janine said") + elif reco_id == "SBC054": + lines = lines.replace("", "") + lines = lines.replace("AUD:\tX", "X:\tX") + lines = lines.replace("AUD:\t") + lines = lines.replace("sensei", "") + lines = lines.replace("ippon", "Ippon") + lines = lines.replace("Ippon", "") + lines = re.sub(r"gi([^a-z])", r"\1", lines) + lines = re.sub(r"Makikomi([^-])", r"\1", lines) + lines = lines.replace("Hane-goshi", "") + lines = lines.replace("Sode-makikomi", "") + lines = lines.replace("shiai", "") + lines = lines.replace("randori", "") + lines = re.sub(r"Sode([^-])", r"\1", lines) + lines = lines.replace("Ukemi", "") + lines = lines.replace("Ha-jime", "") + lines = lines.replace("Ude-garami", "") + lines = lines.replace("Hane-uchi-mata", "") + lines = lines.replace("Uchi-", "Uchi-mata") + lines = lines.replace("Uchi-mata", "") + lines = lines.replace("Hande-maki- \1", lines) + lines = lines.replace("%Sode-maki[komi]", "") + lines = lines.replace("Tsuri-komi", "") + lines = lines.replace("Uchi-komi", "") + lines = lines.replace("O-uchi", "") + lines = lines.replace("Goshi", "") + lines = lines.replace("Uchi]-mata", "") + lines = lines.replace("Komi", "") + lines = lines.replace("Tani-otoshi", "") + lines = lines.replace("Hane-maki][2komi=", "") + lines = lines.replace("Makikomi-waza", "") + lines = lines.replace("Seoi", "") + lines = lines.replace("uke", "") + elif reco_id == "SBC059": + lines = lines.replace("[]", "hour[6=6] F>") + + spk_buffer = "" + lang_buffer = "English" + for line in lines.split("\n"): + #### Transcript fixes + if line == "77.200\t77.540 :\t(H)": + continue + if line.startswith("000000000 000000000 ") or line.startswith("0.00 0.00"): + continue + if line.startswith("\t"): + line.lstrip("\t") + if "and in his pamphlet the Liber Arbetrio" in line: + continue + + line = line.strip() + line = re.sub(r" +", " ", line) + line = re.sub(r"\t+", "\t", line) + fields = line.strip().split("\t") + if len(fields) == 4: + spk_field, raw_trans = fields[2:] + start, end = [float(time.rstrip()) for time in fields[:2]] + elif len(fields) == 3: + if len(fields[0].rstrip().split(" ")) > 1: + spk_field, raw_trans = fields[1:] + start, end = 
[float(time) for time in fields[0].split(" ")[:2]] + raw_trans = fields[-1] + else: + start, end = [float(time.rstrip()) for time in fields[:2]] + spk_field_candidate = fields[2].split(" ")[0] + if re.fullmatch(r"[A-Z]+:", spk_field_candidate): + spk_field = spk_field_candidate + raw_trans = " ".join(fields[2].split(" ")[1:]) + else: + spk_field = "" + raw_trans = fields[2] + elif len(fields) == 2: + timesish = fields[0].rstrip().split(" ") + if len(timesish) == 1: + continue + start, end = [float(time) for time in timesish[:2]] + if len(timesish) > 2: + spk_field = timesish[2] + raw_trans = fields[1] + else: + spk_field_candidate = fields[1].split(" ")[0] + if re.fullmatch(r"[A-Z]+:", spk_field_candidate): + spk_field = spk_field_candidate + raw_trans = " ".join(fields[1].split(" ")[1:]) + else: + spk_field = "" + raw_trans = fields[1] + else: + split = line.split(" ") + if re.fullmatch(r"[0-9]+\.[0-9]+", split[0]) and re.fullmatch( + r"[0-9]+\.[0-9]+", split[1] + ): + start, end = [float(time.rstrip()) for time in split[:2]] + if re.fullmatch(r"[A-Z]+:", split[2]): + spk_field = split[2] + raw_trans = " ".join(split[3:]) + else: + spk_field = "" + raw_trans = " ".join(split[2:]) + else: + continue + + #### Transcript fixes + if raw_trans == "[2ENV", "ENV", ">MAC", ">DOG", ">HORSE", ">CAT", ">BABY"]: + continue + elif spk_field == "#READ": + spk_field = "WALT" + + if spk_field: + spk_field = re.sub(r"^[^A-Z]", "", spk_field) + spk_buffer = spk_field + + utt_id = f"{reco_id}_{int(start*1000):07}_{int(end*1000):07}_{spk_buffer}" + + text, lang_tag = _parse_raw_transcript(raw_trans) + + if "l" in lang_tag: + for _ in range(lang_tag.count("l")): + new_lang = next(lang_iterators[reco_id]) + if "c" in lang_tag: + lang_buffer = f"English-{new_lang}" + else: + lang_buffer = new_lang + elif "c" in lang_tag: + lang_buffer = f"English-{lang_buffer.split('-')[-1]}" + + spk_key = reco_id + "_" + spk_buffer + if spk_key not in spk2glob_dict and reco_id != "SBC021": + spk2gen_dict[spk_key] = None + spk2glob_dict[spk_key] = dummy_spk_iterator.next(spk_key) + + if spk_key in spk2glob_dict: + speaker = spk2glob_dict[spk_key] + gender = spk2gen_dict[spk_key] + else: + speaker = dummy_spk_iterator.next(spk_key) + gender = None + + if re.search(r"[A-Za-z]", text): + supervisions.append( + SupervisionSegment( + id=utt_id, + recording_id=reco_id, + start=start, + duration=end - start, + channel=[0, 1], + text=text, + language=lang_buffer, + speaker=speaker, + gender=gender, + ) + ) + + if lang_tag: + if lang_tag[-1] == "r": + lang_buffer = "English" + if lang_tag[-1] == "l": + lang_buffer = lang_buffer.split("-")[-1] + + return supervisions + + +def _parse_raw_transcript(transcript: str): + + transcript = transcript.replace("0h", "oh") + transcript = transcript.replace("s@so", "s- so") + transcript = transcript.replace("la@ter", "later") + transcript = transcript.replace("you@.", "you @.") + transcript = transcript.replace("[N=]", "N") + transcript = transcript.replace("[2C2]=", "C") + transcript = transcript.replace("[MM=]", "MM") + transcript = transcript.replace("[I=]", "I") + + transcript = transcript.replace("(YELL)", "") + + transcript = transcript.replace("_", "-") + + transcript = transcript.replace("=", "") + transcript = transcript.replace("%", "") + + # Process overlapped UNKs before they get removed by the following step + transcript = re.sub(r"\[([2-9]?)([A-Z])+\1\]", r"\2", transcript) + + # Paired parenthetical/bracket annotation remover + paren_matches = re.findall(r"\([^a-z@ ]*\)", transcript) 
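+ # Each match is a parenthesized event annotation such as (H) or (TSK):
+ # parentheses whose contents have no lowercase letters, "@" signs, or
+ # spaces. The loop below deletes the annotation itself but keeps any
+ # square brackets found inside it, so that overlap-bracket pairs stay
+ # balanced for the later bracket-removal passes.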
+ for paren_match in paren_matches: + transcript = transcript.replace( + paren_match, re.sub(r"[^\[\]]", "", paren_match) + ) + brack_matches = re.findall(r"\[[^a-z@ ]+\]", transcript) + for brack_match in brack_matches: + transcript = transcript.replace( + brack_match, re.sub(r"[^\(\)]", "", brack_match) + ) + + transcript = re.sub(r"<<[^a-z@ ]+>>", "", transcript) + transcript = re.sub(r"<<[^a-z@ ]+", "", transcript) + transcript = re.sub(r"[^a-z@ ]+>>", "", transcript) + + transcript = re.sub(r"<[^a-z@ ]+>", "", transcript) + transcript = re.sub(r"<[^a-z2 ]*[^2 ]([ <])", r"\1", transcript) + transcript = re.sub(r"([ >])[^a-z2 ]*[^a-z 2]>", r"\1", transcript) + + transcript = re.sub(r"\[[2-9]?", "", transcript) + transcript = re.sub(r"[2-9]?\]", "", transcript) + + transcript = transcript.replace("(Hx)", " ") + transcript = transcript.replace("(hx)", " ") + transcript = transcript.replace("(@Hx)", "@") + + transcript = transcript.replace("(COUGH COUGH)", " ") + transcript = transcript.replace("(SNIFF", "") + + transcript = transcript.replace("(", "") + transcript = transcript.replace(")", "") + + transcript = transcript.replace("< ", " ") + transcript = transcript.replace(" >", " ") + + transcript = re.sub(r"[^A-Za-z-]-+", "", transcript) + transcript = re.sub(r"\.\.+", "", transcript) + + transcript = transcript.replace("+", "") + transcript = transcript.replace("&", "") + transcript = transcript.replace("#", "") + transcript = transcript.replace("*", "") + + transcript = re.sub(r"!([A-Za-z])", r"\1", transcript) + + # Deal with extra white space + transcript = re.sub(r" +", " ", transcript) + + # Merge X's + transcript = re.sub(r"X+", "X", transcript) + + # Parse laughter + transcript = transcript.replace("on@,", "on @,") + transcript = re.sub(r"([a-z-])@([a-z])", r"\1\2", transcript) + transcript = re.sub(r"@+", "@", transcript) + transcript = re.sub(r"(^| )@([^ ])", r" @ \2", transcript) + transcript = re.sub(r"([^ ])@( |$)", r"\1 @ ", transcript) + transcript = transcript.replace("@ @", "@").replace("@ @", "@") + + transcript = re.sub(r"(^| )X([ ,.?']|$)", r"\1\2", transcript) + transcript = re.sub(r"(^| )X([ ,.?']|$)", r"\1\2", transcript) + transcript = re.sub(r"X-($| )", r"\1", transcript) + + transcript = re.sub(r"^ ", "", transcript) + transcript = re.sub(r" $", "", transcript) + + transcript = transcript.replace(" .", ".") + transcript = transcript.replace(" ,", ",") + transcript = transcript.replace(" ?", "?") + + transcript = re.sub(r"^\. ", "", transcript) + transcript = re.sub(r"^\.$", "", transcript) + + if ( + len(transcript.split(" 1 + and re.search(r"[A-Za-z]", transcript.split("")) > 1 + and re.search(r"[A-Za-z]", transcript.split("L2>")[-1]) + ): + lang_tag = "c" + else: + lang_tag = "" + + transcript = transcript.replace("@", "") + transcript = transcript.replace("", "") + + if "L2" in transcript: + lang_tag = lang_tag + re.sub( + r"()(?!.*()).*$", + r"\1", + re.sub(r".*?()", r"\1", transcript), + ) + lang_tag = lang_tag.replace("", "r") + + # We choose to leave the language tags in, but uncommenting this would remove them. 
+ # transcript = transcript.replace("", "") + + return transcript, lang_tag From 43aa2d0e0bd34a0d4f29019b19333bc5fb90e7b1 Mon Sep 17 00:00:00 2001 From: Matthew Maciejewski Date: Thu, 18 Jan 2024 16:01:54 -0500 Subject: [PATCH 02/10] transcript fixes --- lhotse/recipes/sbcsae.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/lhotse/recipes/sbcsae.py b/lhotse/recipes/sbcsae.py index f855e2d01..853993e94 100644 --- a/lhotse/recipes/sbcsae.py +++ b/lhotse/recipes/sbcsae.py @@ -262,6 +262,8 @@ def _filename_to_supervisions(filename: Path, spk2gen_dict: dict, spk2glob_dict: lines = lines.replace("(YAWN0", "(YAWN)") elif reco_id == "SBC008": lines = lines.replace("[", "[") + elif reco_id == "SBC010": + lines = lines.replace("366.87 366.87", "366.16 366.87") elif reco_id == "SBC012": lines = lines.replace( "\n".join(["807.02 807.92\tFRANK: \t.. Mhm."] * 2), @@ -301,6 +303,10 @@ def _filename_to_supervisions(filename: Path, spk2gen_dict: dict, spk2glob_dict: lines = lines.replace("0ttle", "little") elif reco_id == "SBC018": lines = lines.replace("0f", "if") + lines = lines.replace( + "129.916\t130.324\tLINDSEY:\tYeah.\n129.915\t130.325\t\t[Mhm.]\n", + "129.915\t130.325\tLINDSEY:\t[Mhm.] Yeah.\n", + ) elif reco_id == "SBC019": lines = lines.replace("cello_(/cheller/)", "cheller") lines = lines.replace("(sigh)", "(SIGH)") From 80e0a33e50d5b7b8ec9cf73ab6cedfc8c43aa193 Mon Sep 17 00:00:00 2001 From: Matthew Maciejewski Date: Fri, 26 Jan 2024 16:41:28 -0500 Subject: [PATCH 03/10] added SBCSAE download --- lhotse/recipes/__init__.py | 2 +- lhotse/recipes/sbcsae.py | 112 ++++++++++++++++++++++++++++++++++++- 2 files changed, 111 insertions(+), 3 deletions(-) diff --git a/lhotse/recipes/__init__.py b/lhotse/recipes/__init__.py index 2fc7a6f0b..2b38cba61 100644 --- a/lhotse/recipes/__init__.py +++ b/lhotse/recipes/__init__.py @@ -63,7 +63,7 @@ from .nsc import prepare_nsc from .peoples_speech import prepare_peoples_speech from .rir_noise import download_rir_noise, prepare_rir_noise -from .sbcsae import prepare_sbcsae +from .sbcsae import download_sbcsae, prepare_sbcsae from .speechcommands import download_speechcommands, prepare_speechcommands from .spgispeech import download_spgispeech, prepare_spgispeech from .stcmds import download_stcmds, prepare_stcmds diff --git a/lhotse/recipes/sbcsae.py b/lhotse/recipes/sbcsae.py index 853993e94..01db3a187 100644 --- a/lhotse/recipes/sbcsae.py +++ b/lhotse/recipes/sbcsae.py @@ -23,6 +23,7 @@ """ import logging import re +import zipfile from pathlib import Path from typing import Dict, Iterable, List, Optional, Sequence, Union @@ -31,6 +32,44 @@ TALKBANK_MP3_ROOT_URL = "https://media.talkbank.org/ca/SBCSAE/" TALKBANK_WAV_ROOT_URL = "https://media.talkbank.org/ca/SBCSAE/0wav/" +UCSB_TRANSCRIPT_URL = "https://www.linguistics.ucsb.edu/sites/secure.lsit.ucsb.edu.ling.d7/files/sitefiles/research/SBC/SBCorpus.zip" +UCSB_CHAT_URL = "https://www.linguistics.ucsb.edu/sites/secure.lsit.ucsb.edu.ling.d7/files/sitefiles/research/SBC/SBCSAE_chat.zip" +UCSB_METADATA_URL = "https://www.linguistics.ucsb.edu/sites/secure.lsit.ucsb.edu.ling.d7/files/sitefiles/research/SBC/metadata.zip" +LDC_DOC_ROOT_URL = "https://catalog.ldc.upenn.edu/docs/" +LDC_DOCS = { + "LDC2000S85": [ + "segment.tbl", + "segment.txt", + "speaker.tbl", + "speaker.txt", + ], + "LDC2003S06": [ + "annotations.txt", + "file.tbl", + "segment.tbl", + "segment.txt", + "segment_summaries.txt", + "speaker.tbl", + "speaker.txt", + "table.txt", + ], + "LDC2004S10": [ + "annotations.txt", + "file.tbl", + 
"segment.tbl", + "segment.txt", + "segment_summaries.txt", + "speaker.tbl", + "speaker.txt", + "table.txt", + ], + "LDC2005S25": [ + "segment.tbl", + "segment.txt", + "speaker.doc", + "speaker.tbl", + ], +} lang_iterators = { "SBC004": iter(["Spanish"] * 17), @@ -69,6 +108,7 @@ def next(self, spk="SBCXXX_X"): def download_sbcsae( target_dir: Pathlike = ".", download_mp3: Optional[bool] = False, + force_download: Optional[bool] = False, ) -> Path: """ Download the dataset. Due to availability/broken link issues, this downloads @@ -84,10 +124,78 @@ def download_sbcsae( corpus_dir.mkdir(parents=True, exist_ok=True) completed_detector = target_dir / ".sbcsae_completed" - if completed_detector.is_fil(): + if completed_detector.is_file(): logging.info(f"Skipping download because {completed_detector} exists.") return corpus_dir - return "FALSE" + + # Download audio + wav_dir = corpus_dir / "WAV" + mp3_dir = corpus_dir / "MP3" + wav_dir.mkdir(parents=True, exist_ok=True) + mp3_dir.mkdir(parents=True, exist_ok=True) + for i in range(1, 61): + session = f"{i:02d}" + wav_path = wav_dir / ("SBC0" + session + ".wav") + resumable_download( + TALKBANK_WAV_ROOT_URL + session + ".wav", + filename=wav_path, + force_download=force_download, + ) + if download_mp3: + mp3_path = mp3_dir / ("SBC0" + session + ".mp3") + resumable_download( + TALKBANK_MP3_ROOT_URL + session + ".mp3", + filename=mp3_path, + force_download=force_download, + ) + + # Download annotations + transcript_zip = corpus_dir / "TRN.zip" + resumable_download( + UCSB_TRANSCRIPT_URL, filename=transcript_zip, force_download=force_download + ) + with zipfile.ZipFile(transcript_zip) as f: + f.extractall(path=corpus_dir) + + chat_zip = corpus_dir / "CHAT.zip" + resumable_download(UCSB_CHAT_URL, filename=chat_zip, force_download=force_download) + target_chat_dir = corpus_dir / "CHAT" + if target_chat_dir.is_dir(): + if not any(target_chat_dir.iterdir()): + target_chat_dir.rmdir() + elif force_download: + for item in target_chat_dir.iterdir(): + item.unlink() + target_chat_dir.rmdir() + else: + with zipfile.ZipFile(chat_zip) as f: + f.extractall(path=corpus_dir) + chat_dir = corpus_dir / "SBCSAE" + chat_dir.rename(corpus_dir / "CHAT") + + metadata_zip = corpus_dir / "metadata.zip" + resumable_download( + UCSB_METADATA_URL, filename=metadata_zip, force_download=force_download + ) + metadata_dir = corpus_dir / "metadata" + metadata_dir.mkdir(parents=True, exist_ok=True) + with zipfile.ZipFile(metadata_zip) as f: + f.extractall(path=metadata_dir) + + doc_dir = corpus_dir / "documentation" + doc_dir.mkdir(parents=True, exist_ok=True) + for LDC_split in LDC_DOCS: + LDC_dir = doc_dir / LDC_split + LDC_dir.mkdir(parents=True, exist_ok=True) + for doc_file in LDC_DOCS[LDC_split]: + doc_file_url = LDC_DOC_ROOT_URL + LDC_split + "/" + doc_file + resumable_download( + doc_file_url, filename=LDC_dir / doc_file, force_download=force_download + ) + + completed_detector.touch() + + return corpus_dir def prepare_sbcsae( From cb9b0dd1136cdc03e7ee5f8fce766051324d5ea5 Mon Sep 17 00:00:00 2001 From: Matthew Wiesner Date: Fri, 15 Mar 2024 15:52:55 -0400 Subject: [PATCH 04/10] Updates sbcsae to properly process mono_channel audio and adds speaker origin as geolocations for speakers --- lhotse/recipes/sbcsae.py | 115 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 111 insertions(+), 4 deletions(-) diff --git a/lhotse/recipes/sbcsae.py b/lhotse/recipes/sbcsae.py index 01db3a187..7092f76c0 100644 --- a/lhotse/recipes/sbcsae.py +++ b/lhotse/recipes/sbcsae.py @@ -29,7 
+29,8 @@ from lhotse import Recording, RecordingSet, SupervisionSegment, SupervisionSet from lhotse.utils import Pathlike, resumable_download - +from lhotse import fix_manifests +from tqdm import tqdm TALKBANK_MP3_ROOT_URL = "https://media.talkbank.org/ca/SBCSAE/" TALKBANK_WAV_ROOT_URL = "https://media.talkbank.org/ca/SBCSAE/0wav/" UCSB_TRANSCRIPT_URL = "https://www.linguistics.ucsb.edu/sites/secure.lsit.ucsb.edu.ling.d7/files/sitefiles/research/SBC/SBCorpus.zip" @@ -71,6 +72,7 @@ ], } + lang_iterators = { "SBC004": iter(["Spanish"] * 17), "SBC006": iter(["French"] * 2), @@ -90,6 +92,32 @@ } +# These corrections to the participant metadata were needed to get geolocations +# from the geopy package. +annotation_corrections = { + "metro St.L. IL": "Saint Louis MO", # Use the MO side of the city + "middle Wes MO": "Missouri", # Just use the state location + "S.E.Texas TX": "South East Texas", # The geo package seems to parse this + "South Alabama mostly AL": "Andalusia Alabama", # Arbitrarily chosen nearby town + "South FL": "South Bay Florida", # Arbitrarily chosen nearby town + "Walnut Cre CA": "Walnut Creek CA", # Spelling error + "San Leandr CA": "San Leandro CA", + "Boston/Santa Fe MA/NM": "Boston/Santa Fe\tMA/NM", # Handle this specially + "Boston/New Mexico MA/NM": "Boston/Santa Fe\tMA/NM", + "Millstad IL": "Millstadt IL", # Spelling error + "Cleveland/San Francisco OH/CA": "Cleveland/San Fransisco\tOH/CA", # Handle specially + "Jamesville WI": "Janesville WI", # Spelling error + "Falls Church/Albuquerque VA/NM": "Falls Church/Albuquerque\tVA/NM", # Handle specially + "Southern Florida": "South Bay Florida", # Arbitarily chosen nearby town + "Massachusetts MA": "Massachusetts", + "New Zealand n/a": "New Zealand", + "French n/a": "France", +} + + +bad_stereo = ["SBC020","SBC021","SBC027","SBC028"] + + class Dummy_Spk_Iterator: def __init__(self): self.ind = 213 @@ -229,16 +257,40 @@ def prepare_sbcsae( doc_dir = corpus_dir / "documentation" spk2gen_dict, spk2glob_dict = generate_speaker_map_dicts(doc_dir) - + spk_coords = generate_geolocations(corpus_dir, spk2glob_dict) supervisions = [] trn_dir = corpus_dir / "TRN" - for p in trn_dir.glob("*.trn"): + for p in tqdm(list(trn_dir.glob("*.trn")), "Collecting and normalizing transcripts ..."): for supervision in _filename_to_supervisions(p, spk2gen_dict, spk2glob_dict): supervisions.append(supervision) if len(supervisions) == 0: logging.warning(f"No supervisions found in {trn_dir}") - supervisions = SupervisionSet.from_segments(supervisions) + + supervisions_ = [] + for s in supervisions: + # A final check against 0 duration segments though this should not + # occur + if s.duration < 0.02: + s_ = s.pad(pad=0.02) + else: + s_ = s + if s_.speaker in spk_coords: + s_.custom = { + 'lat': spk_coords[s.speaker][0][0], + 'lon': spk_coords[s.speaker][0][1], + } + + if ( + not isinstance(recordings[s.recording_id].channel_ids, list) or + len(recordings[s.recording_id].channel_ids) < 2 or + s.recording_id in bad_stereo + ): + s_.channel = recordings[s.recording_id].channel_ids[0] + supervisions_.append(s_) + + supervisions = SupervisionSet.from_segments(supervisions_) + recordings, supervisions = fix_manifests(recordings, supervisions) if output_dir is not None: if isinstance(output_dir, str): @@ -252,6 +304,61 @@ def prepare_sbcsae( return manifests +def generate_geolocations(corpus: Path, spk2glob_dict: dict): + if not is_module_available("geopy"): + raise ImportError( + "geopy package not found. Please install..." 
" (pip install geopy)" + ) + else: + from geopy.geocoders import Nominatim + from geopy import geocoders + + speakers = corpus.rglob("documentation/LDC*/speaker.tbl") + # This geolocator object is repsonsible for generating a + # latitiude and longitude from a textual description of a location, i.e., + # CHICAGO IL --> (41,-87) + geolocator = Nominatim(user_agent='myapplication') + spk_coords = {} + for spk in tqdm(list(speakers), "Generating speaker geolocations..."): + with open(spk) as f: + for l in f: + vals = l.strip().split(",") + if len(vals) < 5: + continue + # Check non-empty + empty_hometown = vals[4] in ("", "?") + empty_state = vals[5] in ("", "?") + if empty_hometown and not empty_state: + loc = vals[5] + ", United States" + elif not empty_hometown: + orig_loc = vals[4] + " " + vals[5] + loc = annotation_corrections.get(orig_loc, orig_loc) + else: + continue + if "/" in loc: + try: + hometowns, states = loc.split("\t", 1) + hometowns = hometowns.split("/") + states = states.split("/") + coords = [] + for h, s in zip(hometowns, states): + coords.append(geolocator.geocode(f"{h} {s}", timeout=None)[1]) + except ValueError: + states, country = loc.split(",", 1) + coords = [] + for s in states.split("/"): + coords.append(geolocator.geocode(f"{s}, {country}", timeout=None)[1]) + else: + coords = [geolocator.geocode(loc, timeout=None)[1]] + spk_coords[vals[0]] = coords + spknum2spk_name = {n.split("_")[0]: n for s, n in spk2glob_dict.items()} + spk_coords_ = {} + for s in spk_coords: + if s in spknum2spk_name: + spk_coords_[spknum2spk_name[s]] = spk_coords[s] + return spk_coords_ + + def generate_speaker_map_dicts(doc_dir: Path): spk2gen_dict = dict() spk2glob_dict = dict() From 67d839b7fc5f49038ad0d082e55a7f89391aa8fc Mon Sep 17 00:00:00 2001 From: Matthew Wiesner Date: Fri, 15 Mar 2024 16:40:15 -0400 Subject: [PATCH 05/10] Fixes a few 0-width segments by adding 0.02 s of padding --- lhotse/recipes/sbcsae.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/lhotse/recipes/sbcsae.py b/lhotse/recipes/sbcsae.py index 7092f76c0..1e975e450 100644 --- a/lhotse/recipes/sbcsae.py +++ b/lhotse/recipes/sbcsae.py @@ -28,7 +28,9 @@ from typing import Dict, Iterable, List, Optional, Sequence, Union from lhotse import Recording, RecordingSet, SupervisionSegment, SupervisionSet -from lhotse.utils import Pathlike, resumable_download +from lhotse.utils import ( + Pathlike, resumable_download, is_module_available, fastcopy, +) from lhotse import fix_manifests from tqdm import tqdm TALKBANK_MP3_ROOT_URL = "https://media.talkbank.org/ca/SBCSAE/" @@ -257,7 +259,7 @@ def prepare_sbcsae( doc_dir = corpus_dir / "documentation" spk2gen_dict, spk2glob_dict = generate_speaker_map_dicts(doc_dir) - spk_coords = generate_geolocations(corpus_dir, spk2glob_dict) + #spk_coords = generate_geolocations(corpus_dir, spk2glob_dict) supervisions = [] trn_dir = corpus_dir / "TRN" for p in tqdm(list(trn_dir.glob("*.trn")), "Collecting and normalizing transcripts ..."): @@ -269,12 +271,18 @@ def prepare_sbcsae( supervisions_ = [] for s in supervisions: - # A final check against 0 duration segments though this should not - # occur if s.duration < 0.02: - s_ = s.pad(pad=0.02) - else: + # Just pad with a minimum 0.02 duration + s_reco = recordings[s.recording_id] + new_start = max(0, s.start - 0.01) + s_ = fastcopy( + s, + start=new_start, + duration=min(new_start + 0.02, s_reco.duration), + ) + else: s_ = s + if s_.speaker in spk_coords: s_.custom = { 'lat': spk_coords[s.speaker][0][0], 
From dade60b37feb2e6d40777f3fece3f553952b802c Mon Sep 17 00:00:00 2001 From: Matthew Maciejewski Date: Tue, 30 Jul 2024 16:57:44 -0400 Subject: [PATCH 06/10] small fix --- lhotse/bin/modes/recipes/sbcsae.py | 18 ++- lhotse/recipes/sbcsae.py | 249 +++++++++++------------------ 2 files changed, 105 insertions(+), 162 deletions(-) diff --git a/lhotse/bin/modes/recipes/sbcsae.py b/lhotse/bin/modes/recipes/sbcsae.py index 23f36e09b..d447c8db2 100644 --- a/lhotse/bin/modes/recipes/sbcsae.py +++ b/lhotse/bin/modes/recipes/sbcsae.py @@ -12,26 +12,34 @@ @prepare.command(context_settings=dict(show_default=True)) @click.argument("corpus_dir", type=click.Path(exists=True, dir_okay=True)) @click.argument("output_dir", type=click.Path()) +@click.option( + "--geolocation", + type=bool, + is_flag=True, + default=False, + help="Include geographic coordinates of speakers' hometowns in the manifests.", +) def sbcsae( corpus_dir: Pathlike, output_dir: Pathlike, + geolocation: bool, ): """SBCSAE data preparation.""" - prepare_sbcsae(corpus_dir, output_dir=output_dir) + prepare_sbcsae(corpus_dir, output_dir=output_dir, geolocation=geolocation) @download.command(context_settings=dict(show_default=True)) @click.argument("target_dir", type=click.Path()) @click.option( - "--download-mp3", + "--force-download", type=bool, is_flag=True, default=False, - help="Download the mp3 copy of the audio as well as wav.", + help="Force download.", ) def sbcsae( target_dir: Pathlike, - download_mp3: Optional[bool] = False, + force_download: bool, ): """SBCSAE download.""" - download_sbcsae(target_dir, download_mp3=download_mp3) + download_sbcsae(target_dir, force_download=force_download) diff --git a/lhotse/recipes/sbcsae.py b/lhotse/recipes/sbcsae.py index 1e975e450..aba12d594 100644 --- a/lhotse/recipes/sbcsae.py +++ b/lhotse/recipes/sbcsae.py @@ -19,60 +19,45 @@ (UMass, Boston). For the publication of Parts 3 and 4, the authors are John W. Du Bois and Robert Englebretson. -TODO: detail on splits and such +If you use the corpus or our data preparation scripts, please cite the following: +@misc{dubois_2005, + author={Du Bois, John W. and Chafe, Wallace L. and Meyer, Charles and Thompson, Sandra A. and Englebretson, Robert and Martey, Nii}, + year={2000--2005}, + title={{S}anta {B}arbara corpus of spoken {A}merican {E}nglish, {P}arts 1--4}, + address={Philadelphia}, + organization={Linguistic Data Consortium}, +} +@inproceedings{maciejewski24_interspeech, + author={Matthew Maciejewski and Dominik Klement and Ruizhe Huang and Matthew Wiesner and Sanjeev Khudanpur}, + title={Evaluating the {Santa Barbara} Corpus: Challenges of the Breadth of Conversational Spoken Language}, + year=2024, + booktitle={Proc. 
Interspeech 2024} +} """ import logging import re -import zipfile +import tarfile from pathlib import Path -from typing import Dict, Iterable, List, Optional, Sequence, Union +from typing import Dict, Optional, Union + +from tqdm import tqdm -from lhotse import Recording, RecordingSet, SupervisionSegment, SupervisionSet +from lhotse import ( + Recording, + RecordingSet, + SupervisionSegment, + SupervisionSet, + fix_manifests, +) from lhotse.utils import ( - Pathlike, resumable_download, is_module_available, fastcopy, + Pathlike, + fastcopy, + is_module_available, + resumable_download, + safe_extract, ) -from lhotse import fix_manifests -from tqdm import tqdm -TALKBANK_MP3_ROOT_URL = "https://media.talkbank.org/ca/SBCSAE/" -TALKBANK_WAV_ROOT_URL = "https://media.talkbank.org/ca/SBCSAE/0wav/" -UCSB_TRANSCRIPT_URL = "https://www.linguistics.ucsb.edu/sites/secure.lsit.ucsb.edu.ling.d7/files/sitefiles/research/SBC/SBCorpus.zip" -UCSB_CHAT_URL = "https://www.linguistics.ucsb.edu/sites/secure.lsit.ucsb.edu.ling.d7/files/sitefiles/research/SBC/SBCSAE_chat.zip" -UCSB_METADATA_URL = "https://www.linguistics.ucsb.edu/sites/secure.lsit.ucsb.edu.ling.d7/files/sitefiles/research/SBC/metadata.zip" -LDC_DOC_ROOT_URL = "https://catalog.ldc.upenn.edu/docs/" -LDC_DOCS = { - "LDC2000S85": [ - "segment.tbl", - "segment.txt", - "speaker.tbl", - "speaker.txt", - ], - "LDC2003S06": [ - "annotations.txt", - "file.tbl", - "segment.tbl", - "segment.txt", - "segment_summaries.txt", - "speaker.tbl", - "speaker.txt", - "table.txt", - ], - "LDC2004S10": [ - "annotations.txt", - "file.tbl", - "segment.tbl", - "segment.txt", - "segment_summaries.txt", - "speaker.tbl", - "speaker.txt", - "table.txt", - ], - "LDC2005S25": [ - "segment.tbl", - "segment.txt", - "speaker.doc", - "speaker.tbl", - ], -} + +SBCSAE_TAR_URL = "https://www.openslr.org/resources/155/SBCSAE.tar.gz" lang_iterators = { @@ -97,27 +82,27 @@ # These corrections to the participant metadata were needed to get geolocations # from the geopy package. annotation_corrections = { - "metro St.L. IL": "Saint Louis MO", # Use the MO side of the city - "middle Wes MO": "Missouri", # Just use the state location - "S.E.Texas TX": "South East Texas", # The geo package seems to parse this - "South Alabama mostly AL": "Andalusia Alabama", # Arbitrarily chosen nearby town - "South FL": "South Bay Florida", # Arbitrarily chosen nearby town - "Walnut Cre CA": "Walnut Creek CA", # Spelling error + "metro St.L. 
IL": "Saint Louis MO", # Use the MO side of the city + "middle Wes MO": "Missouri", # Just use the state location + "S.E.Texas TX": "South East Texas", # The geo package seems to parse this + "South Alabama mostly AL": "Andalusia Alabama", # Arbitrarily chosen nearby town + "South FL": "South Bay Florida", # Arbitrarily chosen nearby town + "Walnut Cre CA": "Walnut Creek CA", # Spelling error "San Leandr CA": "San Leandro CA", - "Boston/Santa Fe MA/NM": "Boston/Santa Fe\tMA/NM", # Handle this specially + "Boston/Santa Fe MA/NM": "Boston/Santa Fe\tMA/NM", # Handle this specially "Boston/New Mexico MA/NM": "Boston/Santa Fe\tMA/NM", - "Millstad IL": "Millstadt IL", # Spelling error - "Cleveland/San Francisco OH/CA": "Cleveland/San Fransisco\tOH/CA", # Handle specially - "Jamesville WI": "Janesville WI", # Spelling error - "Falls Church/Albuquerque VA/NM": "Falls Church/Albuquerque\tVA/NM", # Handle specially - "Southern Florida": "South Bay Florida", # Arbitarily chosen nearby town + "Millstad IL": "Millstadt IL", # Spelling error + "Cleveland/San Francisco OH/CA": "Cleveland/San Fransisco\tOH/CA", # Handle specially + "Jamesville WI": "Janesville WI", # Spelling error + "Falls Church/Albuquerque VA/NM": "Falls Church/Albuquerque\tVA/NM", # Handle specially + "Southern Florida": "South Bay Florida", # Arbitarily chosen nearby town "Massachusetts MA": "Massachusetts", "New Zealand n/a": "New Zealand", "French n/a": "France", } -bad_stereo = ["SBC020","SBC021","SBC027","SBC028"] +bad_stereo = ["SBC020", "SBC021", "SBC027", "SBC028"] class Dummy_Spk_Iterator: @@ -137,93 +122,30 @@ def next(self, spk="SBCXXX_X"): def download_sbcsae( target_dir: Pathlike = ".", - download_mp3: Optional[bool] = False, force_download: Optional[bool] = False, ) -> Path: """ - Download the dataset. Due to availability/broken link issues, this downloads - from multiple sources. + Download and untar the dataset. :param: target_dir: Pathlike, the path of the directory where the SBCSAE dataset will be downloaded. - :param: download_mp3: bool, if True download the mp3 files as well as wav. + :param force_download: bool, if True, download the archive even if it already exists. :return: The path to the directory with the data. 
""" target_dir = Path(target_dir) corpus_dir = target_dir / "SBCSAE" corpus_dir.mkdir(parents=True, exist_ok=True) + tar_path = target_dir / "SBCSAE.tar.gz" completed_detector = target_dir / ".sbcsae_completed" if completed_detector.is_file(): logging.info(f"Skipping download because {completed_detector} exists.") return corpus_dir - # Download audio - wav_dir = corpus_dir / "WAV" - mp3_dir = corpus_dir / "MP3" - wav_dir.mkdir(parents=True, exist_ok=True) - mp3_dir.mkdir(parents=True, exist_ok=True) - for i in range(1, 61): - session = f"{i:02d}" - wav_path = wav_dir / ("SBC0" + session + ".wav") - resumable_download( - TALKBANK_WAV_ROOT_URL + session + ".wav", - filename=wav_path, - force_download=force_download, - ) - if download_mp3: - mp3_path = mp3_dir / ("SBC0" + session + ".mp3") - resumable_download( - TALKBANK_MP3_ROOT_URL + session + ".mp3", - filename=mp3_path, - force_download=force_download, - ) - - # Download annotations - transcript_zip = corpus_dir / "TRN.zip" - resumable_download( - UCSB_TRANSCRIPT_URL, filename=transcript_zip, force_download=force_download - ) - with zipfile.ZipFile(transcript_zip) as f: - f.extractall(path=corpus_dir) - - chat_zip = corpus_dir / "CHAT.zip" - resumable_download(UCSB_CHAT_URL, filename=chat_zip, force_download=force_download) - target_chat_dir = corpus_dir / "CHAT" - if target_chat_dir.is_dir(): - if not any(target_chat_dir.iterdir()): - target_chat_dir.rmdir() - elif force_download: - for item in target_chat_dir.iterdir(): - item.unlink() - target_chat_dir.rmdir() - else: - with zipfile.ZipFile(chat_zip) as f: - f.extractall(path=corpus_dir) - chat_dir = corpus_dir / "SBCSAE" - chat_dir.rename(corpus_dir / "CHAT") - - metadata_zip = corpus_dir / "metadata.zip" - resumable_download( - UCSB_METADATA_URL, filename=metadata_zip, force_download=force_download - ) - metadata_dir = corpus_dir / "metadata" - metadata_dir.mkdir(parents=True, exist_ok=True) - with zipfile.ZipFile(metadata_zip) as f: - f.extractall(path=metadata_dir) - - doc_dir = corpus_dir / "documentation" - doc_dir.mkdir(parents=True, exist_ok=True) - for LDC_split in LDC_DOCS: - LDC_dir = doc_dir / LDC_split - LDC_dir.mkdir(parents=True, exist_ok=True) - for doc_file in LDC_DOCS[LDC_split]: - doc_file_url = LDC_DOC_ROOT_URL + LDC_split + "/" + doc_file - resumable_download( - doc_file_url, filename=LDC_dir / doc_file, force_download=force_download - ) - - completed_detector.touch() + resumable_download(SBCSAE_TAR_URL, filename=tar_path, force_download=force_download) + with tarfile.open(tar_path) as tar: + safe_extract(tar, path=corpus_dir) + completed_detector.touch() return corpus_dir @@ -231,6 +153,7 @@ def download_sbcsae( def prepare_sbcsae( corpus_dir: Pathlike, output_dir: Optional[Pathlike] = None, + geolocation: Optional[bool] = False, ) -> Dict[str, Union[RecordingSet, SupervisionSet]]: """ Prepares manifest for SBCSAE dataset. @@ -240,7 +163,9 @@ def prepare_sbcsae( releases of the data. Check script comments for details if using an existing corpus download rather than Lhotse's download script. :param: output_dir: Root directory where .json manifests are stored. - :return: + :param: geolocation: Include geographic coordinates of speakers' hometowns + in the manifests. + :return: The manifests. 
""" # Resolve corpus_dir type if isinstance(corpus_dir, str): @@ -257,12 +182,18 @@ def prepare_sbcsae( if len(recordings) == 0: logging.warning(f"No .wav files found in {audio_dir}") - doc_dir = corpus_dir / "documentation" + doc_dir = corpus_dir / "docs" spk2gen_dict, spk2glob_dict = generate_speaker_map_dicts(doc_dir) - #spk_coords = generate_geolocations(corpus_dir, spk2glob_dict) + + spk_coords = {} + if geolocation: + spk_coords = generate_geolocations(corpus_dir, spk2glob_dict) + supervisions = [] trn_dir = corpus_dir / "TRN" - for p in tqdm(list(trn_dir.glob("*.trn")), "Collecting and normalizing transcripts ..."): + for p in tqdm( + list(trn_dir.glob("*.trn")), "Collecting and normalizing transcripts ..." + ): for supervision in _filename_to_supervisions(p, spk2gen_dict, spk2glob_dict): supervisions.append(supervision) @@ -280,23 +211,23 @@ def prepare_sbcsae( start=new_start, duration=min(new_start + 0.02, s_reco.duration), ) - else: + else: s_ = s - + if s_.speaker in spk_coords: s_.custom = { - 'lat': spk_coords[s.speaker][0][0], - 'lon': spk_coords[s.speaker][0][1], + "lat": spk_coords[s.speaker][0][0], + "lon": spk_coords[s.speaker][0][1], } if ( - not isinstance(recordings[s.recording_id].channel_ids, list) or - len(recordings[s.recording_id].channel_ids) < 2 or - s.recording_id in bad_stereo + not isinstance(recordings[s.recording_id].channel_ids, list) + or len(recordings[s.recording_id].channel_ids) < 2 + or s.recording_id in bad_stereo ): s_.channel = recordings[s.recording_id].channel_ids[0] - supervisions_.append(s_) - + supervisions_.append(s_) + supervisions = SupervisionSet.from_segments(supervisions_) recordings, supervisions = fix_manifests(recordings, supervisions) @@ -318,14 +249,14 @@ def generate_geolocations(corpus: Path, spk2glob_dict: dict): "geopy package not found. Please install..." 
" (pip install geopy)" ) else: - from geopy.geocoders import Nominatim from geopy import geocoders + from geopy.geocoders import Nominatim - speakers = corpus.rglob("documentation/LDC*/speaker.tbl") - # This geolocator object is repsonsible for generating a + speakers = corpus.rglob("docs/Part_*/speaker.tbl") + # This geolocator object is repsonsible for generating a # latitiude and longitude from a textual description of a location, i.e., - # CHICAGO IL --> (41,-87) - geolocator = Nominatim(user_agent='myapplication') + # CHICAGO IL --> (41,-87) + geolocator = Nominatim(user_agent="myapplication") spk_coords = {} for spk in tqdm(list(speakers), "Generating speaker geolocations..."): with open(spk) as f: @@ -350,12 +281,16 @@ def generate_geolocations(corpus: Path, spk2glob_dict: dict): states = states.split("/") coords = [] for h, s in zip(hometowns, states): - coords.append(geolocator.geocode(f"{h} {s}", timeout=None)[1]) + coords.append( + geolocator.geocode(f"{h} {s}", timeout=None)[1] + ) except ValueError: states, country = loc.split(",", 1) coords = [] for s in states.split("/"): - coords.append(geolocator.geocode(f"{s}, {country}", timeout=None)[1]) + coords.append( + geolocator.geocode(f"{s}, {country}", timeout=None)[1] + ) else: coords = [geolocator.geocode(loc, timeout=None)[1]] spk_coords[vals[0]] = coords @@ -372,8 +307,8 @@ def generate_speaker_map_dicts(doc_dir: Path): spk2glob_dict = dict() spk_num_to_reco_ids = dict() - for LDC_split in ["LDC2000S85", "LDC2003S06", "LDC2005S25"]: - filename = doc_dir / LDC_split / "segment.tbl" + for part in ["Part_1", "Part_2", "Part_4"]: + filename = doc_dir / part / "segment.tbl" for line in filename.read_text().split("\n"): if "speaker:" in line: line = line.replace(" 0", "\t0") @@ -384,8 +319,8 @@ def generate_speaker_map_dicts(doc_dir: Path): if reco_id not in spk_num_to_reco_ids[spk_num]: spk_num_to_reco_ids[spk_num].append(reco_id) - for LDC_split in ["LDC2000S85", "LDC2003S06", "LDC2005S25"]: - filename = doc_dir / LDC_split / "speaker.tbl" + for part in ["Part_1", "Part_2", "Part_4"]: + filename = doc_dir / part / "speaker.tbl" for line in filename.read_text().split("\n"): if "," not in line: continue @@ -404,9 +339,9 @@ def generate_speaker_map_dicts(doc_dir: Path): spk2gen_dict[reco + "_" + name] = gen spk2glob_dict[reco + "_" + name] = spk_num + "_" + name - for LDC_split in ["LDC2004S10"]: + for part in ["Part_3"]: seg_list = [] - filename = doc_dir / LDC_split / "segment.tbl" + filename = doc_dir / part / "segment.tbl" for line in filename.read_text().split("\n"): if "speaker:" in line: reco_id = re.sub(r"sbc0?([0-9]{3})\s.*", r"SBC\1", line) @@ -414,7 +349,7 @@ def generate_speaker_map_dicts(doc_dir: Path): seg_list.append([name, reco_id]) spk_list = [] - filename = doc_dir / LDC_split / "speaker.tbl" + filename = doc_dir / part / "speaker.tbl" for line in filename.read_text().split("\n"): if "," not in line: continue From ae9c4588dd0f8580bf42272cd1dba398d0488ce9 Mon Sep 17 00:00:00 2001 From: Dominik Klement Date: Tue, 17 Sep 2024 14:47:48 -0400 Subject: [PATCH 07/10] Add alignment export option Exports aligned supervisions along with the original supervisions with or without changing the text after manual inspections and corrections. 
--- lhotse/bin/modes/recipes/sbcsae.py | 22 +++- lhotse/recipes/sbcsae.py | 159 ++++++++++++++++++++++++++++- 2 files changed, 179 insertions(+), 2 deletions(-) diff --git a/lhotse/bin/modes/recipes/sbcsae.py b/lhotse/bin/modes/recipes/sbcsae.py index d447c8db2..3a89291cf 100644 --- a/lhotse/bin/modes/recipes/sbcsae.py +++ b/lhotse/bin/modes/recipes/sbcsae.py @@ -19,13 +19,33 @@ default=False, help="Include geographic coordinates of speakers' hometowns in the manifests.", ) +@click.option( + "--export-alignments", + type=bool, + is_flag=True, + default=True, + help="Export re-aligned manifests.", +) +@click.option( + "--fix-transcripts", + type=bool, + is_flag=True, + default=True, + help="Replace transcripts by the the re-aligned ones (with manual fixes applied).", +) def sbcsae( corpus_dir: Pathlike, output_dir: Pathlike, geolocation: bool, + export_alignments: bool, + fix_transcripts: bool, ): """SBCSAE data preparation.""" - prepare_sbcsae(corpus_dir, output_dir=output_dir, geolocation=geolocation) + prepare_sbcsae(corpus_dir, + output_dir=output_dir, + geolocation=geolocation, + export_alignments=export_alignments, + fix_transcripts=fix_transcripts) @download.command(context_settings=dict(show_default=True)) diff --git a/lhotse/recipes/sbcsae.py b/lhotse/recipes/sbcsae.py index aba12d594..00a4f5c34 100644 --- a/lhotse/recipes/sbcsae.py +++ b/lhotse/recipes/sbcsae.py @@ -34,10 +34,13 @@ booktitle={Proc. Interspeech 2024} } """ +from copy import deepcopy +from dataclasses import dataclass import logging +from math import inf +from pathlib import Path import re import tarfile -from pathlib import Path from typing import Dict, Optional, Union from tqdm import tqdm @@ -154,6 +157,8 @@ def prepare_sbcsae( corpus_dir: Pathlike, output_dir: Optional[Pathlike] = None, geolocation: Optional[bool] = False, + export_alignments: Optional[bool] = True, + fix_transcripts: Optional[bool] = True, ) -> Dict[str, Union[RecordingSet, SupervisionSet]]: """ Prepares manifest for SBCSAE dataset. 
@@ -240,6 +245,21 @@ def prepare_sbcsae( manifests = {"recordings": recordings, "supervisions": supervisions} + if export_alignments: + asr_supervisions, diar_supervisions = apply_aligned_stms(list(recordings.ids), supervisions, + change_text=fix_transcripts) + _, asr_supervisions = fix_manifests(recordings, asr_supervisions) + _, diar_supervisions = fix_manifests(recordings, diar_supervisions) + + asr_supervisions.to_file(output_dir / "sbcsae_supervisions_asr_aligned.jsonl.gz") + diar_supervisions.to_file(output_dir / "sbcsae_supervisions_diar_aligned.jsonl.gz") + + manifests = { + "asr_supervisions": asr_supervisions, + "diar_supervisions": diar_supervisions, + **manifests + } + return manifests @@ -960,3 +980,140 @@ def _parse_raw_transcript(transcript: str): # transcript = transcript.replace(" L2>", "") return transcript, lang_tag + + +@dataclass +class StmSegment: + recording_id: str + speaker: str + start: float + end: float + text: str + channel: str = "1" + + +def parse_stm_file(data: str) -> list[StmSegment]: + lines = data.split("\n") + stm_segments = [] + + for line in lines: + if not line: + continue + + fields = line.strip().split() + reco_id, channel, speaker = fields[:3] + start, end = [float(time) for time in fields[3:5]] + text = " ".join(fields[5:]) + + stm_segments.append( + StmSegment(recording_id=reco_id, speaker=speaker, start=start, end=end, text=text, channel=channel) + ) + + return stm_segments + + +def retrieve_stm_file(url) -> list[StmSegment]: + import urllib.request + + response = urllib.request.urlopen(url) + data = response.read().decode("utf-8") + + return parse_stm_file(data) + + +def norm_txt(text: str): + text = text.strip() + text = text.lower() + return text + + +def compute_iou(seg1: SupervisionSegment, seg2: StmSegment) -> float: + start = max(seg1.start, seg2.start) + end = min(seg1.end, seg2.end) + + intersection = max(0.0, end - start) + union = (seg1.end - seg1.start) + (seg2.end - seg2.start) - intersection + + return intersection / union + + +def apply_stm(recording_ids: list[str], supervisions: SupervisionSet, aligned_stm_segs: list[StmSegment], + change_text: bool = False) -> SupervisionSet: + + if not is_module_available("intervaltree"): + raise ImportError( + "intervaltree package not found. Please install..." " (pip install intervaltree)" + ) + else: + from intervaltree import IntervalTree + + if not is_module_available("jiwer"): + raise ImportError( + "jiwer package not found. Please install..." " (pip install jiwer==3.0.4)" + ) + else: + from jiwer import cer + + sset = deepcopy(supervisions) + + per_rec_its = {} + for rid in recording_ids: + per_rec_its[rid] = IntervalTree() + for stm_seg in tqdm(aligned_stm_segs, desc="Building interval tree..."): + per_rec_its[stm_seg.recording_id][stm_seg.start:stm_seg.end] = stm_seg + + for s in tqdm(sset, desc="Applying STM..."): + # We need to find the closest and best-matching segment. + # Some labeled segments were misplaced a lot and fixed by manual post-processing. + # Hence, in order to find a good match, we tuned collar value to find all matches. + # Example: 451 seconds, SBC027 recording. + collar = 2.0 + matching_segments = list(filter(lambda x: x.data.speaker == s.speaker, per_rec_its[s.recording_id][s.start-collar:s.end+collar])) + # Alignments used slightly different speaker IDs for UNK speakers, so we relax the speaker ID matching. 
+ if not matching_segments: + matching_segments = per_rec_its[s.recording_id][s.start-collar:s.end+collar] + + best_cer = inf + best_cer_res = None + best_matching_seg = None + best_iou = 0.0 + + for matching_seg in matching_segments: + cer_res = cer(norm_txt(s.text), norm_txt(matching_seg.data.text), return_dict=True) + cer_val = cer_res["cer"] + + if cer_val < best_cer: + best_cer = cer_val + best_cer_res = cer_res + best_matching_seg = matching_seg + best_iou = compute_iou(s, matching_seg.data) + + # There's been an update between the alignments and the lhotse recipe, so some UNK speakers have shifted IDs. + # It's enough to match the speaker names (or UNK). + if cer_val == best_cer and matching_seg.data.speaker.split("_")[1] == s.speaker.split("_")[1]: + current_iou = compute_iou(s, matching_seg.data) + if current_iou >= best_iou: + best_matching_seg = matching_seg + best_cer_res = cer_res + best_iou = current_iou + + if s.speaker.split("_")[1] == best_matching_seg.data.speaker.split("_")[1] and best_cer_res["substitutions"] == best_cer_res["deletions"] == 0 and (best_cer < 0.5 or len(s.text) < 3): + s.start = best_matching_seg.data.start + s.duration = best_matching_seg.data.end - best_matching_seg.data.start + if change_text: + s.text = best_matching_seg.data.text + + per_rec_its[s.recording_id].remove(best_matching_seg) + + return sset + + +def apply_aligned_stms(recording_ids: list[str], processed_supervisions: SupervisionSet, + change_text: bool = False) -> tuple[SupervisionSet, SupervisionSet]: + aligned_for_asr_stm = retrieve_stm_file("https://raw.githubusercontent.com/domklement/SBCSAE_alignments/main/alignments/stm/aligned_for_asr.stm") + aligned_for_diar_stm = retrieve_stm_file("https://raw.githubusercontent.com/domklement/SBCSAE_alignments/main/alignments/stm/aligned_for_diar.stm") + + asr_sup = apply_stm(recording_ids, processed_supervisions, aligned_for_asr_stm, change_text) + diar_sup = apply_stm(recording_ids, processed_supervisions, aligned_for_diar_stm, change_text) + + return asr_sup, diar_sup From be64e93ffe278079a9e6a16b66796d55c10492ba Mon Sep 17 00:00:00 2001 From: Matthew Maciejewski Date: Wed, 18 Sep 2024 11:39:15 -0400 Subject: [PATCH 08/10] update to cli flags and docs --- lhotse/bin/modes/recipes/sbcsae.py | 27 ++++----- lhotse/recipes/sbcsae.py | 94 +++++++++++++++++++++--------- 2 files changed, 75 insertions(+), 46 deletions(-) diff --git a/lhotse/bin/modes/recipes/sbcsae.py b/lhotse/bin/modes/recipes/sbcsae.py index 3a89291cf..d6eece516 100644 --- a/lhotse/bin/modes/recipes/sbcsae.py +++ b/lhotse/bin/modes/recipes/sbcsae.py @@ -20,32 +20,25 @@ help="Include geographic coordinates of speakers' hometowns in the manifests.", ) @click.option( - "--export-alignments", + "--omit-realignments", type=bool, is_flag=True, - default=True, - help="Export re-aligned manifests.", -) -@click.option( - "--fix-transcripts", - type=bool, - is_flag=True, - default=True, - help="Replace transcripts by the re-aligned ones (with manual fixes applied).", + default=False, + help="Only output the original corpus segmentation without boundary improvements.", ) def sbcsae( corpus_dir: Pathlike, output_dir: Pathlike, geolocation: bool, - export_alignments: bool, - fix_transcripts: bool, + omit_realignments: bool, ): """SBCSAE data preparation.""" - prepare_sbcsae(corpus_dir, - output_dir=output_dir, - geolocation=geolocation, - export_alignments=export_alignments, - fix_transcripts=fix_transcripts) + prepare_sbcsae( + corpus_dir, + output_dir=output_dir, +
geolocation=geolocation, + omit_realignments=omit_realignments, + ) @download.command(context_settings=dict(show_default=True)) diff --git a/lhotse/recipes/sbcsae.py b/lhotse/recipes/sbcsae.py index 00a4f5c34..504e9f337 100644 --- a/lhotse/recipes/sbcsae.py +++ b/lhotse/recipes/sbcsae.py @@ -34,13 +34,13 @@ booktitle={Proc. Interspeech 2024} } """ +import logging +import re +import tarfile from copy import deepcopy from dataclasses import dataclass -import logging from math import inf from pathlib import Path -import re -import tarfile from typing import Dict, Optional, Union from tqdm import tqdm @@ -157,8 +157,7 @@ def prepare_sbcsae( corpus_dir: Pathlike, output_dir: Optional[Pathlike] = None, geolocation: Optional[bool] = False, - export_alignments: Optional[bool] = True, - fix_transcripts: Optional[bool] = True, + omit_realignments: Optional[bool] = False, ) -> Dict[str, Union[RecordingSet, SupervisionSet]]: """ Prepares manifest for SBCSAE dataset. @@ -170,6 +169,7 @@ def prepare_sbcsae( :param: output_dir: Root directory where .json manifests are stored. :param: geolocation: Include geographic coordinates of speakers' hometowns in the manifests. + :param: omit_realignments: Only output original corpus segmentation. :return: The manifests. """ # Resolve corpus_dir type @@ -245,19 +245,24 @@ def prepare_sbcsae( manifests = {"recordings": recordings, "supervisions": supervisions} - if export_alignments: - asr_supervisions, diar_supervisions = apply_aligned_stms(list(recordings.ids), supervisions, - change_text=fix_transcripts) + if not omit_realignments: + asr_supervisions, diar_supervisions = apply_aligned_stms( + list(recordings.ids), supervisions + ) _, asr_supervisions = fix_manifests(recordings, asr_supervisions) _, diar_supervisions = fix_manifests(recordings, diar_supervisions) - asr_supervisions.to_file(output_dir / "sbcsae_supervisions_asr_aligned.jsonl.gz") - diar_supervisions.to_file(output_dir / "sbcsae_supervisions_diar_aligned.jsonl.gz") + asr_supervisions.to_file( + output_dir / "sbcsae_supervisions_asr_aligned.jsonl.gz" + ) + diar_supervisions.to_file( + output_dir / "sbcsae_supervisions_diar_aligned.jsonl.gz" + ) manifests = { "asr_supervisions": asr_supervisions, "diar_supervisions": diar_supervisions, - **manifests + **manifests, } return manifests @@ -1006,7 +1011,14 @@ def parse_stm_file(data: str) -> list[StmSegment]: text = " ".join(fields[5:]) stm_segments.append( - StmSegment(recording_id=reco_id, speaker=speaker, start=start, end=end, text=text, channel=channel) + StmSegment( + recording_id=reco_id, + speaker=speaker, + start=start, + end=end, + text=text, + channel=channel, + ) ) return stm_segments @@ -1037,12 +1049,16 @@ def compute_iou(seg1: SupervisionSegment, seg2: StmSegment) -> float: return intersection / union -def apply_stm(recording_ids: list[str], supervisions: SupervisionSet, aligned_stm_segs: list[StmSegment], - change_text: bool = False) -> SupervisionSet: +def apply_stm( + recording_ids: list[str], + supervisions: SupervisionSet, + aligned_stm_segs: list[StmSegment], +) -> SupervisionSet: if not is_module_available("intervaltree"): raise ImportError( - "intervaltree package not found. Please install..." " (pip install intervaltree)" + "intervaltree package not found. Please install..." 
+ " (pip install intervaltree)" ) else: from intervaltree import IntervalTree @@ -1060,7 +1076,7 @@ def apply_stm(recording_ids: list[str], supervisions: SupervisionSet, aligned_st for rid in recording_ids: per_rec_its[rid] = IntervalTree() for stm_seg in tqdm(aligned_stm_segs, desc="Building interval tree..."): - per_rec_its[stm_seg.recording_id][stm_seg.start:stm_seg.end] = stm_seg + per_rec_its[stm_seg.recording_id][stm_seg.start : stm_seg.end] = stm_seg for s in tqdm(sset, desc="Applying STM..."): # We need to find the closest and best-matching segment. @@ -1068,10 +1084,17 @@ def apply_stm(recording_ids: list[str], supervisions: SupervisionSet, aligned_st # Hence, in order to find a good match, we tuned collar value to find all matches. # Example: 451 seconds, SBC027 recording. collar = 2.0 - matching_segments = list(filter(lambda x: x.data.speaker == s.speaker, per_rec_its[s.recording_id][s.start-collar:s.end+collar])) + matching_segments = list( + filter( + lambda x: x.data.speaker == s.speaker, + per_rec_its[s.recording_id][s.start - collar : s.end + collar], + ) + ) # Alignments used slightly different speaker IDs for UNK speakers, so we relax the speaker ID matching. if not matching_segments: - matching_segments = per_rec_its[s.recording_id][s.start-collar:s.end+collar] + matching_segments = per_rec_its[s.recording_id][ + s.start - collar : s.end + collar + ] best_cer = inf best_cer_res = None @@ -1079,7 +1102,9 @@ def apply_stm(recording_ids: list[str], supervisions: SupervisionSet, aligned_st best_iou = 0.0 for matching_seg in matching_segments: - cer_res = cer(norm_txt(s.text), norm_txt(matching_seg.data.text), return_dict=True) + cer_res = cer( + norm_txt(s.text), norm_txt(matching_seg.data.text), return_dict=True + ) cer_val = cer_res["cer"] if cer_val < best_cer: @@ -1090,30 +1115,41 @@ def apply_stm(recording_ids: list[str], supervisions: SupervisionSet, aligned_st # There's been an update between the alignments and the lhotse recipe, so some UNK speakers have shifted IDs. # It's enough to match the speaker names (or UNK). 
- if cer_val == best_cer and matching_seg.data.speaker.split("_")[1] == s.speaker.split("_")[1]: + if ( + cer_val == best_cer + and matching_seg.data.speaker.split("_")[1] == s.speaker.split("_")[1] + ): current_iou = compute_iou(s, matching_seg.data) if current_iou >= best_iou: best_matching_seg = matching_seg best_cer_res = cer_res best_iou = current_iou - if s.speaker.split("_")[1] == best_matching_seg.data.speaker.split("_")[1] and best_cer_res["substitutions"] == best_cer_res["deletions"] == 0 and (best_cer < 0.5 or len(s.text) < 3): + if ( + s.speaker.split("_")[1] == best_matching_seg.data.speaker.split("_")[1] + and best_cer_res["substitutions"] == best_cer_res["deletions"] == 0 + and (best_cer < 0.5 or len(s.text) < 3) + ): s.start = best_matching_seg.data.start s.duration = best_matching_seg.data.end - best_matching_seg.data.start - if change_text: - s.text = best_matching_seg.data.text + s.text = best_matching_seg.data.text per_rec_its[s.recording_id].remove(best_matching_seg) return sset -def apply_aligned_stms(recording_ids: list[str], processed_supervisions: SupervisionSet, - change_text: bool = False) -> tuple[SupervisionSet, SupervisionSet]: - aligned_for_asr_stm = retrieve_stm_file("https://raw.githubusercontent.com/domklement/SBCSAE_alignments/main/alignments/stm/aligned_for_asr.stm") - aligned_for_diar_stm = retrieve_stm_file("https://raw.githubusercontent.com/domklement/SBCSAE_alignments/main/alignments/stm/aligned_for_diar.stm") +def apply_aligned_stms( + recording_ids: list[str], processed_supervisions: SupervisionSet +) -> tuple[SupervisionSet, SupervisionSet]: + aligned_for_asr_stm = retrieve_stm_file( + "https://raw.githubusercontent.com/domklement/SBCSAE_alignments/main/alignments/stm/aligned_for_asr.stm" + ) + aligned_for_diar_stm = retrieve_stm_file( + "https://raw.githubusercontent.com/domklement/SBCSAE_alignments/main/alignments/stm/aligned_for_diar.stm" + ) - asr_sup = apply_stm(recording_ids, processed_supervisions, aligned_for_asr_stm, change_text) - diar_sup = apply_stm(recording_ids, processed_supervisions, aligned_for_diar_stm, change_text) + asr_sup = apply_stm(recording_ids, processed_supervisions, aligned_for_asr_stm) + diar_sup = apply_stm(recording_ids, processed_supervisions, aligned_for_diar_stm) return asr_sup, diar_sup From ceb7ebbf133dc03d275e22b8c570f568da73ae32 Mon Sep 17 00:00:00 2001 From: Matthew Maciejewski Date: Thu, 3 Oct 2024 09:45:48 -0400 Subject: [PATCH 09/10] added sbcsae to docs and fixed python compatibility --- docs/corpus.rst | 2 ++ lhotse/recipes/sbcsae.py | 6 +++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/docs/corpus.rst b/docs/corpus.rst index 6a5be4f97..bc8d71bfb 100644 --- a/docs/corpus.rst +++ b/docs/corpus.rst @@ -173,6 +173,8 @@ a CLI tool that create the manifests given a corpus directory. 
- :func:`lhotse.recipes.prepare_reazonspeech` * - RIRs and Noises Corpus (OpenSLR 28) - :func:`lhotse.recipes.prepare_rir_noise` + * - SBCSAE + - :func:`lhotse.recipes.prepare_sbcsae` * - Spatial-LibriSpeech - :func:`lhotse.recipes.prepare_spatial_librispeech` * - Speech Commands diff --git a/lhotse/recipes/sbcsae.py b/lhotse/recipes/sbcsae.py index 504e9f337..d318678ad 100644 --- a/lhotse/recipes/sbcsae.py +++ b/lhotse/recipes/sbcsae.py @@ -41,7 +41,7 @@ from dataclasses import dataclass from math import inf from pathlib import Path -from typing import Dict, Optional, Union +from typing import Dict, List, Optional, Union from tqdm import tqdm @@ -997,7 +997,7 @@ class StmSegment: channel: str = "1" -def parse_stm_file(data: str) -> list[StmSegment]: +def parse_stm_file(data: str) -> List[StmSegment]: lines = data.split("\n") stm_segments = [] @@ -1024,7 +1024,7 @@ def parse_stm_file(data: str) -> list[StmSegment]: return stm_segments -def retrieve_stm_file(url) -> list[StmSegment]: +def retrieve_stm_file(url) -> List[StmSegment]: import urllib.request response = urllib.request.urlopen(url) From 538c4be5b0e9d8f9e3402228cd2e931e8a776987 Mon Sep 17 00:00:00 2001 From: Matthew Maciejewski Date: Fri, 4 Oct 2024 10:31:12 -0400 Subject: [PATCH 10/10] more python3.8 fixes --- lhotse/recipes/sbcsae.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lhotse/recipes/sbcsae.py b/lhotse/recipes/sbcsae.py index d318678ad..4927548ba 100644 --- a/lhotse/recipes/sbcsae.py +++ b/lhotse/recipes/sbcsae.py @@ -41,7 +41,7 @@ from dataclasses import dataclass from math import inf from pathlib import Path -from typing import Dict, List, Optional, Union +from typing import Dict, List, Optional, Tuple, Union from tqdm import tqdm @@ -1050,9 +1050,9 @@ def compute_iou(seg1: SupervisionSegment, seg2: StmSegment) -> float: def apply_stm( - recording_ids: list[str], + recording_ids: List[str], supervisions: SupervisionSet, - aligned_stm_segs: list[StmSegment], + aligned_stm_segs: List[StmSegment], ) -> SupervisionSet: if not is_module_available("intervaltree"): @@ -1140,8 +1140,8 @@ def apply_stm( def apply_aligned_stms( - recording_ids: list[str], processed_supervisions: SupervisionSet -) -> tuple[SupervisionSet, SupervisionSet]: + recording_ids: List[str], processed_supervisions: SupervisionSet +) -> Tuple[SupervisionSet, SupervisionSet]: aligned_for_asr_stm = retrieve_stm_file( "https://raw.githubusercontent.com/domklement/SBCSAE_alignments/main/alignments/stm/aligned_for_asr.stm" )
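A closing note on the two compatibility patches above: PEP 585 built-in generics such as list[str] and tuple[...] only became subscriptable at runtime in Python 3.9, and annotations are evaluated when the def statement executes, so the earlier spellings made the module fail at import time on Python 3.8. A minimal illustration, not taken from the recipe:

    # Python 3.8:
    def ids() -> list[str]:  # TypeError: 'type' object is not subscriptable
        return []

    # The portable spelling the patches switch to:
    from typing import List

    def ids() -> List[str]:  # works on 3.8 and newer
        return []

For reference, the STM records consumed by parse_stm_file follow the usual "<recording-id> <channel> <speaker> <start> <end> <text...>" layout, one segment per line. A sketch with an invented record (the speaker ID and times are made up for illustration):

    from lhotse.recipes.sbcsae import parse_stm_file

    segs = parse_stm_file("SBC001 1 0001_LENORE 7.33 9.01 and then we left")
    # -> [StmSegment(recording_id="SBC001", speaker="0001_LENORE",
    #                start=7.33, end=9.01, text="and then we left", channel="1")]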