From 774bde8bafdc2c286f227a307a20c33dc7028cbc Mon Sep 17 00:00:00 2001 From: Matthew Maciejewski Date: Tue, 16 Jan 2024 11:37:15 -0500 Subject: [PATCH 01/10] initial commit --- lhotse/bin/modes/recipes/__init__.py | 1 + lhotse/bin/modes/recipes/sbcsae.py | 37 ++ lhotse/recipes/__init__.py | 1 + lhotse/recipes/sbcsae.py | 798 +++++++++++++++++++++++++++ 4 files changed, 837 insertions(+) create mode 100644 lhotse/bin/modes/recipes/sbcsae.py create mode 100644 lhotse/recipes/sbcsae.py diff --git a/lhotse/bin/modes/recipes/__init__.py b/lhotse/bin/modes/recipes/__init__.py index 5df6ef8de..11a98395b 100644 --- a/lhotse/bin/modes/recipes/__init__.py +++ b/lhotse/bin/modes/recipes/__init__.py @@ -62,6 +62,7 @@ from .peoples_speech import * from .primewords import * from .rir_noise import * +from .sbcsae import * from .speechcommands import * from .spgispeech import * from .stcmds import * diff --git a/lhotse/bin/modes/recipes/sbcsae.py b/lhotse/bin/modes/recipes/sbcsae.py new file mode 100644 index 000000000..23f36e09b --- /dev/null +++ b/lhotse/bin/modes/recipes/sbcsae.py @@ -0,0 +1,37 @@ +from typing import Optional, Sequence + +import click + +from lhotse.bin.modes import download, prepare +from lhotse.recipes.sbcsae import download_sbcsae, prepare_sbcsae +from lhotse.utils import Pathlike + +__all__ = ["sbcsae"] + + +@prepare.command(context_settings=dict(show_default=True)) +@click.argument("corpus_dir", type=click.Path(exists=True, dir_okay=True)) +@click.argument("output_dir", type=click.Path()) +def sbcsae( + corpus_dir: Pathlike, + output_dir: Pathlike, +): + """SBCSAE data preparation.""" + prepare_sbcsae(corpus_dir, output_dir=output_dir) + + +@download.command(context_settings=dict(show_default=True)) +@click.argument("target_dir", type=click.Path()) +@click.option( + "--download-mp3", + type=bool, + is_flag=True, + default=False, + help="Download the mp3 copy of the audio as well as wav.", +) +def sbcsae( + target_dir: Pathlike, + download_mp3: Optional[bool] = False, +): + """SBCSAE download.""" + download_sbcsae(target_dir, download_mp3=download_mp3) diff --git a/lhotse/recipes/__init__.py b/lhotse/recipes/__init__.py index 57e129ea1..2fc7a6f0b 100644 --- a/lhotse/recipes/__init__.py +++ b/lhotse/recipes/__init__.py @@ -63,6 +63,7 @@ from .nsc import prepare_nsc from .peoples_speech import prepare_peoples_speech from .rir_noise import download_rir_noise, prepare_rir_noise +from .sbcsae import prepare_sbcsae from .speechcommands import download_speechcommands, prepare_speechcommands from .spgispeech import download_spgispeech, prepare_spgispeech from .stcmds import download_stcmds, prepare_stcmds diff --git a/lhotse/recipes/sbcsae.py b/lhotse/recipes/sbcsae.py new file mode 100644 index 000000000..f855e2d01 --- /dev/null +++ b/lhotse/recipes/sbcsae.py @@ -0,0 +1,798 @@ +""" +This script downloads and prepares the data directory for the Santa Barbara +Corpus of Spoken American English. + +The Santa Barbara Corpus of Spoken American English is based on a large body of +recordings of naturally occurring spoken interaction from all over the United +States. The Santa Barbara Corpus represents a wide variety of people of +different regional origins, ages, occupations, genders, and ethnic and social +backgrounds. 
The predominant form of language use represented is face-to-face +conversation, but the corpus also documents many other ways that that people use +language in their everyday lives: telephone conversations, card games, food +preparation, on-the-job talk, classroom lectures, sermons, story-telling, town +hall meetings, tour-guide spiels, and more. + +The Santa Barbara Corpus was compiled by researchers in the Linguistics +Department of the University of California, Santa Barbara. The Director of the +Santa Barbara Corpus is John W. Du Bois, working with Associate Editors Wallace +L. Chafe and Sandra A. Thompson (all of UC Santa Barbara), and Charles Meyer +(UMass, Boston). For the publication of Parts 3 and 4, the authors are John W. +Du Bois and Robert Englebretson. + +TODO: detail on splits and such +""" +import logging +import re +from pathlib import Path +from typing import Dict, Iterable, List, Optional, Sequence, Union + +from lhotse import Recording, RecordingSet, SupervisionSegment, SupervisionSet +from lhotse.utils import Pathlike, resumable_download + +TALKBANK_MP3_ROOT_URL = "https://media.talkbank.org/ca/SBCSAE/" +TALKBANK_WAV_ROOT_URL = "https://media.talkbank.org/ca/SBCSAE/0wav/" + +lang_iterators = { + "SBC004": iter(["Spanish"] * 17), + "SBC006": iter(["French"] * 2), + "SBC010": iter(["Spanish"]), + "SBC012": iter(["Greek"] * 2), + "SBC015": iter(["Spanish"] * 10), + "SBC025": iter(["German"] * 2 + ["Latin"]), + "SBC027": iter(["Spanish"] * 6 + ["French"] * 2), + "SBC031": iter(["French"] * 2), + "SBC033": iter(["French"]), + "SBC034": iter(["French"] * 3), + "SBC036": iter(["Spanish"] * 36), + "SBC037": iter(["Spanish"] * 60), + "SBC047": iter(["Spanish"]), + "SBC057": iter(["Japanese"] * 62), + "SBC058": iter(["Spanish"] + ["Italian"] * 2), +} + + +class Dummy_Spk_Iterator: + def __init__(self): + self.ind = 213 + + def next(self, spk="SBCXXX_X"): + self.ind = self.ind + 1 + name = "_".join(spk.split("_")[1:]) + if name.startswith("X") or name.startswith("AUD"): + name = "UNK" + return f"{self.ind:04d}_{name}" + + +dummy_spk_iterator = Dummy_Spk_Iterator() + + +def download_sbcsae( + target_dir: Pathlike = ".", + download_mp3: Optional[bool] = False, +) -> Path: + """ + Download the dataset. Due to availability/broken link issues, this downloads + from multiple sources. + + :param: target_dir: Pathlike, the path of the directory where the SBCSAE + dataset will be downloaded. + :param: download_mp3: bool, if True download the mp3 files as well as wav. + :return: The path to the directory with the data. + """ + target_dir = Path(target_dir) + corpus_dir = target_dir / "SBCSAE" + corpus_dir.mkdir(parents=True, exist_ok=True) + + completed_detector = target_dir / ".sbcsae_completed" + if completed_detector.is_fil(): + logging.info(f"Skipping download because {completed_detector} exists.") + return corpus_dir + return "FALSE" + + +def prepare_sbcsae( + corpus_dir: Pathlike, + output_dir: Optional[Pathlike] = None, +) -> Dict[str, Union[RecordingSet, SupervisionSet]]: + """ + Prepares manifest for SBCSAE dataset. + + :param: corpus_dir: Path to the root where SBCSAE data was downloaded. It + should be called SBCSAE. There is no consistent formatting between + releases of the data. Check script comments for details if using an + existing corpus download rather than Lhotse's download script. + :param: output_dir: Root directory where .json manifests are stored. 
+ :return: + """ + # Resolve corpus_dir type + if isinstance(corpus_dir, str): + corpus_dir = Path(corpus_dir) + + # Resolve output_dir type + if isinstance(output_dir, str): + output_dir = Path(output_dir) + + audio_dir = corpus_dir / "WAV" + recordings = RecordingSet.from_recordings( + Recording.from_file(p) for p in audio_dir.glob("*.wav") + ) + if len(recordings) == 0: + logging.warning(f"No .wav files found in {audio_dir}") + + doc_dir = corpus_dir / "documentation" + spk2gen_dict, spk2glob_dict = generate_speaker_map_dicts(doc_dir) + + supervisions = [] + trn_dir = corpus_dir / "TRN" + for p in trn_dir.glob("*.trn"): + for supervision in _filename_to_supervisions(p, spk2gen_dict, spk2glob_dict): + supervisions.append(supervision) + + if len(supervisions) == 0: + logging.warning(f"No supervisions found in {trn_dir}") + supervisions = SupervisionSet.from_segments(supervisions) + + if output_dir is not None: + if isinstance(output_dir, str): + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + recordings.to_file(output_dir / "sbcsae_recordings.jsonl.gz") + supervisions.to_file(output_dir / "sbcsae_supervisions.jsonl.gz") + + manifests = {"recordings": recordings, "supervisions": supervisions} + + return manifests + + +def generate_speaker_map_dicts(doc_dir: Path): + spk2gen_dict = dict() + spk2glob_dict = dict() + + spk_num_to_reco_ids = dict() + for LDC_split in ["LDC2000S85", "LDC2003S06", "LDC2005S25"]: + filename = doc_dir / LDC_split / "segment.tbl" + for line in filename.read_text().split("\n"): + if "speaker:" in line: + line = line.replace(" 0", "\t0") + reco_id = re.sub(r"sbc0?([0-9]{3})\s.*", r"SBC\1", line) + spk_num = line.split("\t")[-1][:4] + if spk_num not in spk_num_to_reco_ids: + spk_num_to_reco_ids[spk_num] = [] + if reco_id not in spk_num_to_reco_ids[spk_num]: + spk_num_to_reco_ids[spk_num].append(reco_id) + + for LDC_split in ["LDC2000S85", "LDC2003S06", "LDC2005S25"]: + filename = doc_dir / LDC_split / "speaker.tbl" + for line in filename.read_text().split("\n"): + if "," not in line: + continue + line = line.replace("0163,Dan,m", "0166,Dan,M") + spk_num, name, gen = line.split(",")[:3] + name = ( + name.replace(" (extra-corpus)", "").upper().split(" ")[-1].split("/")[0] + ) + gen = gen.upper() + if not gen: + gen = None + + if spk_num in ["0069", "0091", "0092", "0097"]: + continue + for reco in spk_num_to_reco_ids[spk_num]: + spk2gen_dict[reco + "_" + name] = gen + spk2glob_dict[reco + "_" + name] = spk_num + "_" + name + + for LDC_split in ["LDC2004S10"]: + seg_list = [] + filename = doc_dir / LDC_split / "segment.tbl" + for line in filename.read_text().split("\n"): + if "speaker:" in line: + reco_id = re.sub(r"sbc0?([0-9]{3})\s.*", r"SBC\1", line) + name = line.split(" ")[-1].upper().split("/")[0] + seg_list.append([name, reco_id]) + + spk_list = [] + filename = doc_dir / LDC_split / "speaker.tbl" + for line in filename.read_text().split("\n"): + if "," not in line: + continue + spk_num, name, gen = line.split(",")[:3] + name = name.upper().split("/")[0] + spk_list.append([name, spk_num, gen]) + + for seg_info, spk_info in zip(seg_list, spk_list): + assert seg_info[0] == spk_info[0], f"{seg_info[0]} != {spk_info[0]}" + spk2gen_dict[seg_info[1] + "_" + seg_info[0]] = spk_info[2] + spk2glob_dict[seg_info[1] + "_" + seg_info[0]] = ( + spk_info[1] + "_" + spk_info[0] + ) + + for spk_key in [ + "SBC006_ALL", + "SBC008_ALL", + "SBC012_MANY", + "SBC020_AUD", + "SBC021_MANY", + "SBC023_MANY", + "SBC025_AUD", + "SBC026_AUD", + "SBC027_MANY", 
+ "SBC027_AUD", + "SBC028_BOTH", + "SBC030_AUD", + "SBC038_AUD", + "SBC053_RADIO", + "SBC054_AUD", + "SBC054_MANY", + "SBC055_AUD", + ]: + spk2gen_dict[spk_key] = None + spk2glob_dict[spk_key] = spk_key + + return spk2gen_dict, spk2glob_dict + + +def _filename_to_supervisions(filename: Path, spk2gen_dict: dict, spk2glob_dict: dict): + reco_id = filename.stem.split(".")[0] + lines = filename.read_text(encoding="latin1") + supervisions = [] + + #### Transcript fix + lines = lines.replace("\x92", "'") + lines = lines.replace("\u007f", "") + lines = lines.replace("\u0000", "c") + + if reco_id == "SBC002": + lines = lines.replace("(TSK ", "(TSK) ") + elif reco_id == "SBC004": + lines = lines.replace("KATE", "KATHY") + lines = lines.replace("sen~orita", "se\xf1orita") + elif reco_id == "SBC005": + lines = lines.replace("good_/god/", "good") + lines = lines.replace("(H)@>", "(H) @>") + lines = lines.replace("[@@ <@Mm@>]", "[@@ <@ Mm @>]") + elif reco_id == "SBC006": + lines = lines.replace("/pub/", "pub") + lines = lines.replace("", "") + lines = lines.replace("[2(H)2]1", "[2(H)2]") + elif reco_id == "SBC007": + lines = lines.replace( + "\\000000000 000000000 MARY: 1182.90 1186.92\t ", + "\n1182.90 1186.92\tMARY: ", + ) + lines = lines.replace("(YAWN0", "(YAWN)") + elif reco_id == "SBC008": + lines = lines.replace("[", "[") + elif reco_id == "SBC012": + lines = lines.replace( + "\n".join(["807.02 807.92\tFRANK: \t.. Mhm."] * 2), + "807.02 807.92\tFRANK: \t.. Mhm.", + ) + lines = lines.replace("MONTOYA", "MONTOYO") + elif reco_id == "SBC013": + lines = lines.replace("[8<@She8]", "[8<@ She8]") + lines = lines.replace("[2(H) cou_ couch@>2]", "[2(H) cou_ couch @>2]") + lines = lines.replace("[4<@No=4]", "[4<@ No=4]") + lines = lines.replace("VOX2]", "VOX>2]") + elif reco_id == "SBC014": + lines = lines.replace("\\000000000 000000000 ", "\n") + lines = lines.replace("<@he thought", "<@ he thought") + elif reco_id == "SBC015": + lines = lines.replace( + "243.055\t244.080\tKEN:\t(H)] the little,", + "243.465\t244.670\tKEN:\t(H)] the little,", + ) + lines = lines.replace("\u0000urch things.", "church things.") + lines = lines.replace("2(H]=2", "2(H)=2") + lines = lines.replace(" 0.000000e+00", "e") + lines = lines.replace("0m=,", "um=,") + lines = lines.replace("0eople", "people") + lines = lines.replace("0id", "did") + lines = lines.replace("X 0ne %tho", "X uh line %tho") + lines = lines.replace("and 0t [was]", "and it [was]") + lines = lines.replace("0t was like", "it was like") + elif reco_id == "SBC016": + lines = lines.replace("/sed ai/", "sed ai") + elif reco_id == "SBC017": + lines = lines.replace("a\tand names the] na=me,", "and names the] na=me,") + lines = lines.replace(" 0.000000e+00", "e") + lines = lines.replace("[2I mean2", "[2I mean2]") + lines = lines.replace("no2.", "no.") + lines = lines.replace("0rganisms", "organisms") + lines = lines.replace("0ttle", "little") + elif reco_id == "SBC018": + lines = lines.replace("0f", "if") + elif reco_id == "SBC019": + lines = lines.replace("cello_(/cheller/)", "cheller") + lines = lines.replace("(sigh)", "(SIGH)") + lines = lines.replace(" Mo=m", "]", "[]") + lines = lines.replace("5]", "X>5]") + lines = lines.replace("0nly", "uh only") + lines = lines.replace("[50r5]", "[5Or5]") + elif reco_id == "SBC024": + lines = lines.replace(" >ENV: ", ">ENV:\t") + lines = lines.replace(" 0.000000irst", "First") + lines = lines.replace("2[cause", "[2cause") + lines = lines.replace(" 0oes", "does") + lines = lines.replace("0id]", "did]") + elif reco_id == 
"SBC025": + lines = lines.replace("", "<@ Oh[2= @>") + lines = lines.replace(" 0.000000", " ") + lines = lines.replace("i 0f", "i- if") + lines = lines.replace("0f we", "if we") + lines = lines.replace("th- 0t's", "th- that's") + lines = lines.replace("0t's", "it's") + lines = lines.replace("0f", "if") + elif reco_id == "SBC029": + lines = lines.replace("96.230\t98.240\t>ENV: ", "96.230\t98.240\t>ENV:\t") + lines = lines.replace("(H )", "(H)") + lines = lines.replace("<0h=,", "<% Oh=,") + lines = lines.replace("knowX>]", "know X>]") + lines = lines.replace("0verheating", "overheating") + elif reco_id == "SBC030": + lines = lines.replace("DANNY", "BRADLEY") + lines = lines.replace("AUD:\tYes", "X:\tYes") + elif reco_id == "SBC034": + lines = lines.replace("13548.02 ", "1354.802") + elif reco_id == "SBC036": + lines = lines.replace( + "1558.463\t1558.906\t\t[thought he was,", + "1558.906\t1558.923\t\t[thought he was,", + ) + elif reco_id == "SBC038": + lines = lines.replace("AUD:\t... What's", "X_2:\t... What's") + lines = lines.replace("AUD:\t... U", "X_3:\t... U") + lines = lines.replace("AUD:\t... How far", "X_2:\t... How far") + lines = lines.replace("AUD:\t", "") + lines = lines.replace("ANNETTE", "ANETTE") + elif reco_id == "SBC048": + lines = lines.replace("<@in San[2ta", "<@ in San[2ta") + elif reco_id == "SBC052": + lines = lines.replace("~Janine\t said", "~Janine said") + elif reco_id == "SBC054": + lines = lines.replace("", "") + lines = lines.replace("AUD:\tX", "X:\tX") + lines = lines.replace("AUD:\t") + lines = lines.replace("sensei", "") + lines = lines.replace("ippon", "Ippon") + lines = lines.replace("Ippon", "") + lines = re.sub(r"gi([^a-z])", r"\1", lines) + lines = re.sub(r"Makikomi([^-])", r"\1", lines) + lines = lines.replace("Hane-goshi", "") + lines = lines.replace("Sode-makikomi", "") + lines = lines.replace("shiai", "") + lines = lines.replace("randori", "") + lines = re.sub(r"Sode([^-])", r"\1", lines) + lines = lines.replace("Ukemi", "") + lines = lines.replace("Ha-jime", "") + lines = lines.replace("Ude-garami", "") + lines = lines.replace("Hane-uchi-mata", "") + lines = lines.replace("Uchi-", "Uchi-mata") + lines = lines.replace("Uchi-mata", "") + lines = lines.replace("Hande-maki- \1", lines) + lines = lines.replace("%Sode-maki[komi]", "") + lines = lines.replace("Tsuri-komi", "") + lines = lines.replace("Uchi-komi", "") + lines = lines.replace("O-uchi", "") + lines = lines.replace("Goshi", "") + lines = lines.replace("Uchi]-mata", "") + lines = lines.replace("Komi", "") + lines = lines.replace("Tani-otoshi", "") + lines = lines.replace("Hane-maki][2komi=", "") + lines = lines.replace("Makikomi-waza", "") + lines = lines.replace("Seoi", "") + lines = lines.replace("uke", "") + elif reco_id == "SBC059": + lines = lines.replace("[]", "hour[6=6] F>") + + spk_buffer = "" + lang_buffer = "English" + for line in lines.split("\n"): + #### Transcript fixes + if line == "77.200\t77.540 :\t(H)": + continue + if line.startswith("000000000 000000000 ") or line.startswith("0.00 0.00"): + continue + if line.startswith("\t"): + line.lstrip("\t") + if "and in his pamphlet the Liber Arbetrio" in line: + continue + + line = line.strip() + line = re.sub(r" +", " ", line) + line = re.sub(r"\t+", "\t", line) + fields = line.strip().split("\t") + if len(fields) == 4: + spk_field, raw_trans = fields[2:] + start, end = [float(time.rstrip()) for time in fields[:2]] + elif len(fields) == 3: + if len(fields[0].rstrip().split(" ")) > 1: + spk_field, raw_trans = fields[1:] + start, end = 
[float(time) for time in fields[0].split(" ")[:2]] + raw_trans = fields[-1] + else: + start, end = [float(time.rstrip()) for time in fields[:2]] + spk_field_candidate = fields[2].split(" ")[0] + if re.fullmatch(r"[A-Z]+:", spk_field_candidate): + spk_field = spk_field_candidate + raw_trans = " ".join(fields[2].split(" ")[1:]) + else: + spk_field = "" + raw_trans = fields[2] + elif len(fields) == 2: + timesish = fields[0].rstrip().split(" ") + if len(timesish) == 1: + continue + start, end = [float(time) for time in timesish[:2]] + if len(timesish) > 2: + spk_field = timesish[2] + raw_trans = fields[1] + else: + spk_field_candidate = fields[1].split(" ")[0] + if re.fullmatch(r"[A-Z]+:", spk_field_candidate): + spk_field = spk_field_candidate + raw_trans = " ".join(fields[1].split(" ")[1:]) + else: + spk_field = "" + raw_trans = fields[1] + else: + split = line.split(" ") + if re.fullmatch(r"[0-9]+\.[0-9]+", split[0]) and re.fullmatch( + r"[0-9]+\.[0-9]+", split[1] + ): + start, end = [float(time.rstrip()) for time in split[:2]] + if re.fullmatch(r"[A-Z]+:", split[2]): + spk_field = split[2] + raw_trans = " ".join(split[3:]) + else: + spk_field = "" + raw_trans = " ".join(split[2:]) + else: + continue + + #### Transcript fixes + if raw_trans == "[2ENV", "ENV", ">MAC", ">DOG", ">HORSE", ">CAT", ">BABY"]: + continue + elif spk_field == "#READ": + spk_field = "WALT" + + if spk_field: + spk_field = re.sub(r"^[^A-Z]", "", spk_field) + spk_buffer = spk_field + + utt_id = f"{reco_id}_{int(start*1000):07}_{int(end*1000):07}_{spk_buffer}" + + text, lang_tag = _parse_raw_transcript(raw_trans) + + if "l" in lang_tag: + for _ in range(lang_tag.count("l")): + new_lang = next(lang_iterators[reco_id]) + if "c" in lang_tag: + lang_buffer = f"English-{new_lang}" + else: + lang_buffer = new_lang + elif "c" in lang_tag: + lang_buffer = f"English-{lang_buffer.split('-')[-1]}" + + spk_key = reco_id + "_" + spk_buffer + if spk_key not in spk2glob_dict and reco_id != "SBC021": + spk2gen_dict[spk_key] = None + spk2glob_dict[spk_key] = dummy_spk_iterator.next(spk_key) + + if spk_key in spk2glob_dict: + speaker = spk2glob_dict[spk_key] + gender = spk2gen_dict[spk_key] + else: + speaker = dummy_spk_iterator.next(spk_key) + gender = None + + if re.search(r"[A-Za-z]", text): + supervisions.append( + SupervisionSegment( + id=utt_id, + recording_id=reco_id, + start=start, + duration=end - start, + channel=[0, 1], + text=text, + language=lang_buffer, + speaker=speaker, + gender=gender, + ) + ) + + if lang_tag: + if lang_tag[-1] == "r": + lang_buffer = "English" + if lang_tag[-1] == "l": + lang_buffer = lang_buffer.split("-")[-1] + + return supervisions + + +def _parse_raw_transcript(transcript: str): + + transcript = transcript.replace("0h", "oh") + transcript = transcript.replace("s@so", "s- so") + transcript = transcript.replace("la@ter", "later") + transcript = transcript.replace("you@.", "you @.") + transcript = transcript.replace("[N=]", "N") + transcript = transcript.replace("[2C2]=", "C") + transcript = transcript.replace("[MM=]", "MM") + transcript = transcript.replace("[I=]", "I") + + transcript = transcript.replace("(YELL)", "") + + transcript = transcript.replace("_", "-") + + transcript = transcript.replace("=", "") + transcript = transcript.replace("%", "") + + # Process overlapped UNKs before they get removed by the following step + transcript = re.sub(r"\[([2-9]?)([A-Z])+\1\]", r"\2", transcript) + + # Paired parenthetical/bracket annotation remover + paren_matches = re.findall(r"\([^a-z@ ]*\)", transcript) 
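+ # Each match is a parenthesized event annotation such as (H) or (TSK):
+ # parentheses whose contents have no lowercase letters, "@" signs, or
+ # spaces. The loop below deletes the annotation itself but keeps any
+ # square brackets found inside it, so that overlap-bracket pairs stay
+ # balanced for the later bracket-removal passes.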
+ for paren_match in paren_matches: + transcript = transcript.replace( + paren_match, re.sub(r"[^\[\]]", "", paren_match) + ) + brack_matches = re.findall(r"\[[^a-z@ ]+\]", transcript) + for brack_match in brack_matches: + transcript = transcript.replace( + brack_match, re.sub(r"[^\(\)]", "", brack_match) + ) + + transcript = re.sub(r"<<[^a-z@ ]+>>", "", transcript) + transcript = re.sub(r"<<[^a-z@ ]+", "", transcript) + transcript = re.sub(r"[^a-z@ ]+>>", "", transcript) + + transcript = re.sub(r"<[^a-z@ ]+>", "", transcript) + transcript = re.sub(r"<[^a-z2 ]*[^2 ]([ <])", r"\1", transcript) + transcript = re.sub(r"([ >])[^a-z2 ]*[^a-z 2]>", r"\1", transcript) + + transcript = re.sub(r"\[[2-9]?", "", transcript) + transcript = re.sub(r"[2-9]?\]", "", transcript) + + transcript = transcript.replace("(Hx)", " ") + transcript = transcript.replace("(hx)", " ") + transcript = transcript.replace("(@Hx)", "@") + + transcript = transcript.replace("(COUGH COUGH)", " ") + transcript = transcript.replace("(SNIFF", "") + + transcript = transcript.replace("(", "") + transcript = transcript.replace(")", "") + + transcript = transcript.replace("< ", " ") + transcript = transcript.replace(" >", " ") + + transcript = re.sub(r"[^A-Za-z-]-+", "", transcript) + transcript = re.sub(r"\.\.+", "", transcript) + + transcript = transcript.replace("+", "") + transcript = transcript.replace("&", "") + transcript = transcript.replace("#", "") + transcript = transcript.replace("*", "") + + transcript = re.sub(r"!([A-Za-z])", r"\1", transcript) + + # Deal with extra white space + transcript = re.sub(r" +", " ", transcript) + + # Merge X's + transcript = re.sub(r"X+", "X", transcript) + + # Parse laughter + transcript = transcript.replace("on@,", "on @,") + transcript = re.sub(r"([a-z-])@([a-z])", r"\1\2", transcript) + transcript = re.sub(r"@+", "@", transcript) + transcript = re.sub(r"(^| )@([^ ])", r" @ \2", transcript) + transcript = re.sub(r"([^ ])@( |$)", r"\1 @ ", transcript) + transcript = transcript.replace("@ @", "@").replace("@ @", "@") + + transcript = re.sub(r"(^| )X([ ,.?']|$)", r"\1\2", transcript) + transcript = re.sub(r"(^| )X([ ,.?']|$)", r"\1\2", transcript) + transcript = re.sub(r"X-($| )", r"\1", transcript) + + transcript = re.sub(r"^ ", "", transcript) + transcript = re.sub(r" $", "", transcript) + + transcript = transcript.replace(" .", ".") + transcript = transcript.replace(" ,", ",") + transcript = transcript.replace(" ?", "?") + + transcript = re.sub(r"^\. ", "", transcript) + transcript = re.sub(r"^\.$", "", transcript) + + if ( + len(transcript.split(" 1 + and re.search(r"[A-Za-z]", transcript.split("")) > 1 + and re.search(r"[A-Za-z]", transcript.split("L2>")[-1]) + ): + lang_tag = "c" + else: + lang_tag = "" + + transcript = transcript.replace("@", "") + transcript = transcript.replace("", "") + + if "L2" in transcript: + lang_tag = lang_tag + re.sub( + r"()(?!.*()).*$", + r"\1", + re.sub(r".*?()", r"\1", transcript), + ) + lang_tag = lang_tag.replace("", "r") + + # We choose to leave the language tags in, but uncommenting this would remove them. 
+ # transcript = transcript.replace("", "") + + return transcript, lang_tag From 43aa2d0e0bd34a0d4f29019b19333bc5fb90e7b1 Mon Sep 17 00:00:00 2001 From: Matthew Maciejewski Date: Thu, 18 Jan 2024 16:01:54 -0500 Subject: [PATCH 02/10] transcript fixes --- lhotse/recipes/sbcsae.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/lhotse/recipes/sbcsae.py b/lhotse/recipes/sbcsae.py index f855e2d01..853993e94 100644 --- a/lhotse/recipes/sbcsae.py +++ b/lhotse/recipes/sbcsae.py @@ -262,6 +262,8 @@ def _filename_to_supervisions(filename: Path, spk2gen_dict: dict, spk2glob_dict: lines = lines.replace("(YAWN0", "(YAWN)") elif reco_id == "SBC008": lines = lines.replace("[", "[") + elif reco_id == "SBC010": + lines = lines.replace("366.87 366.87", "366.16 366.87") elif reco_id == "SBC012": lines = lines.replace( "\n".join(["807.02 807.92\tFRANK: \t.. Mhm."] * 2), @@ -301,6 +303,10 @@ def _filename_to_supervisions(filename: Path, spk2gen_dict: dict, spk2glob_dict: lines = lines.replace("0ttle", "little") elif reco_id == "SBC018": lines = lines.replace("0f", "if") + lines = lines.replace( + "129.916\t130.324\tLINDSEY:\tYeah.\n129.915\t130.325\t\t[Mhm.]\n", + "129.915\t130.325\tLINDSEY:\t[Mhm.] Yeah.\n", + ) elif reco_id == "SBC019": lines = lines.replace("cello_(/cheller/)", "cheller") lines = lines.replace("(sigh)", "(SIGH)") From 80e0a33e50d5b7b8ec9cf73ab6cedfc8c43aa193 Mon Sep 17 00:00:00 2001 From: Matthew Maciejewski Date: Fri, 26 Jan 2024 16:41:28 -0500 Subject: [PATCH 03/10] added SBCSAE download --- lhotse/recipes/__init__.py | 2 +- lhotse/recipes/sbcsae.py | 112 ++++++++++++++++++++++++++++++++++++- 2 files changed, 111 insertions(+), 3 deletions(-) diff --git a/lhotse/recipes/__init__.py b/lhotse/recipes/__init__.py index 2fc7a6f0b..2b38cba61 100644 --- a/lhotse/recipes/__init__.py +++ b/lhotse/recipes/__init__.py @@ -63,7 +63,7 @@ from .nsc import prepare_nsc from .peoples_speech import prepare_peoples_speech from .rir_noise import download_rir_noise, prepare_rir_noise -from .sbcsae import prepare_sbcsae +from .sbcsae import download_sbcsae, prepare_sbcsae from .speechcommands import download_speechcommands, prepare_speechcommands from .spgispeech import download_spgispeech, prepare_spgispeech from .stcmds import download_stcmds, prepare_stcmds diff --git a/lhotse/recipes/sbcsae.py b/lhotse/recipes/sbcsae.py index 853993e94..01db3a187 100644 --- a/lhotse/recipes/sbcsae.py +++ b/lhotse/recipes/sbcsae.py @@ -23,6 +23,7 @@ """ import logging import re +import zipfile from pathlib import Path from typing import Dict, Iterable, List, Optional, Sequence, Union @@ -31,6 +32,44 @@ TALKBANK_MP3_ROOT_URL = "https://media.talkbank.org/ca/SBCSAE/" TALKBANK_WAV_ROOT_URL = "https://media.talkbank.org/ca/SBCSAE/0wav/" +UCSB_TRANSCRIPT_URL = "https://www.linguistics.ucsb.edu/sites/secure.lsit.ucsb.edu.ling.d7/files/sitefiles/research/SBC/SBCorpus.zip" +UCSB_CHAT_URL = "https://www.linguistics.ucsb.edu/sites/secure.lsit.ucsb.edu.ling.d7/files/sitefiles/research/SBC/SBCSAE_chat.zip" +UCSB_METADATA_URL = "https://www.linguistics.ucsb.edu/sites/secure.lsit.ucsb.edu.ling.d7/files/sitefiles/research/SBC/metadata.zip" +LDC_DOC_ROOT_URL = "https://catalog.ldc.upenn.edu/docs/" +LDC_DOCS = { + "LDC2000S85": [ + "segment.tbl", + "segment.txt", + "speaker.tbl", + "speaker.txt", + ], + "LDC2003S06": [ + "annotations.txt", + "file.tbl", + "segment.tbl", + "segment.txt", + "segment_summaries.txt", + "speaker.tbl", + "speaker.txt", + "table.txt", + ], + "LDC2004S10": [ + "annotations.txt", + "file.tbl", + 
"segment.tbl", + "segment.txt", + "segment_summaries.txt", + "speaker.tbl", + "speaker.txt", + "table.txt", + ], + "LDC2005S25": [ + "segment.tbl", + "segment.txt", + "speaker.doc", + "speaker.tbl", + ], +} lang_iterators = { "SBC004": iter(["Spanish"] * 17), @@ -69,6 +108,7 @@ def next(self, spk="SBCXXX_X"): def download_sbcsae( target_dir: Pathlike = ".", download_mp3: Optional[bool] = False, + force_download: Optional[bool] = False, ) -> Path: """ Download the dataset. Due to availability/broken link issues, this downloads @@ -84,10 +124,78 @@ def download_sbcsae( corpus_dir.mkdir(parents=True, exist_ok=True) completed_detector = target_dir / ".sbcsae_completed" - if completed_detector.is_fil(): + if completed_detector.is_file(): logging.info(f"Skipping download because {completed_detector} exists.") return corpus_dir - return "FALSE" + + # Download audio + wav_dir = corpus_dir / "WAV" + mp3_dir = corpus_dir / "MP3" + wav_dir.mkdir(parents=True, exist_ok=True) + mp3_dir.mkdir(parents=True, exist_ok=True) + for i in range(1, 61): + session = f"{i:02d}" + wav_path = wav_dir / ("SBC0" + session + ".wav") + resumable_download( + TALKBANK_WAV_ROOT_URL + session + ".wav", + filename=wav_path, + force_download=force_download, + ) + if download_mp3: + mp3_path = mp3_dir / ("SBC0" + session + ".mp3") + resumable_download( + TALKBANK_MP3_ROOT_URL + session + ".mp3", + filename=mp3_path, + force_download=force_download, + ) + + # Download annotations + transcript_zip = corpus_dir / "TRN.zip" + resumable_download( + UCSB_TRANSCRIPT_URL, filename=transcript_zip, force_download=force_download + ) + with zipfile.ZipFile(transcript_zip) as f: + f.extractall(path=corpus_dir) + + chat_zip = corpus_dir / "CHAT.zip" + resumable_download(UCSB_CHAT_URL, filename=chat_zip, force_download=force_download) + target_chat_dir = corpus_dir / "CHAT" + if target_chat_dir.is_dir(): + if not any(target_chat_dir.iterdir()): + target_chat_dir.rmdir() + elif force_download: + for item in target_chat_dir.iterdir(): + item.unlink() + target_chat_dir.rmdir() + else: + with zipfile.ZipFile(chat_zip) as f: + f.extractall(path=corpus_dir) + chat_dir = corpus_dir / "SBCSAE" + chat_dir.rename(corpus_dir / "CHAT") + + metadata_zip = corpus_dir / "metadata.zip" + resumable_download( + UCSB_METADATA_URL, filename=metadata_zip, force_download=force_download + ) + metadata_dir = corpus_dir / "metadata" + metadata_dir.mkdir(parents=True, exist_ok=True) + with zipfile.ZipFile(metadata_zip) as f: + f.extractall(path=metadata_dir) + + doc_dir = corpus_dir / "documentation" + doc_dir.mkdir(parents=True, exist_ok=True) + for LDC_split in LDC_DOCS: + LDC_dir = doc_dir / LDC_split + LDC_dir.mkdir(parents=True, exist_ok=True) + for doc_file in LDC_DOCS[LDC_split]: + doc_file_url = LDC_DOC_ROOT_URL + LDC_split + "/" + doc_file + resumable_download( + doc_file_url, filename=LDC_dir / doc_file, force_download=force_download + ) + + completed_detector.touch() + + return corpus_dir def prepare_sbcsae( From cb9b0dd1136cdc03e7ee5f8fce766051324d5ea5 Mon Sep 17 00:00:00 2001 From: Matthew Wiesner Date: Fri, 15 Mar 2024 15:52:55 -0400 Subject: [PATCH 04/10] Updates sbcsae to properly process mono_channel audio and adds speaker origin as geolocations for speakers --- lhotse/recipes/sbcsae.py | 115 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 111 insertions(+), 4 deletions(-) diff --git a/lhotse/recipes/sbcsae.py b/lhotse/recipes/sbcsae.py index 01db3a187..7092f76c0 100644 --- a/lhotse/recipes/sbcsae.py +++ b/lhotse/recipes/sbcsae.py @@ -29,7 
+29,8 @@ from lhotse import Recording, RecordingSet, SupervisionSegment, SupervisionSet from lhotse.utils import Pathlike, resumable_download - +from lhotse import fix_manifests +from tqdm import tqdm TALKBANK_MP3_ROOT_URL = "https://media.talkbank.org/ca/SBCSAE/" TALKBANK_WAV_ROOT_URL = "https://media.talkbank.org/ca/SBCSAE/0wav/" UCSB_TRANSCRIPT_URL = "https://www.linguistics.ucsb.edu/sites/secure.lsit.ucsb.edu.ling.d7/files/sitefiles/research/SBC/SBCorpus.zip" @@ -71,6 +72,7 @@ ], } + lang_iterators = { "SBC004": iter(["Spanish"] * 17), "SBC006": iter(["French"] * 2), @@ -90,6 +92,32 @@ } +# These corrections to the participant metadata were needed to get geolocations +# from the geopy package. +annotation_corrections = { + "metro St.L. IL": "Saint Louis MO", # Use the MO side of the city + "middle Wes MO": "Missouri", # Just use the state location + "S.E.Texas TX": "South East Texas", # The geo package seems to parse this + "South Alabama mostly AL": "Andalusia Alabama", # Arbitrarily chosen nearby town + "South FL": "South Bay Florida", # Arbitrarily chosen nearby town + "Walnut Cre CA": "Walnut Creek CA", # Spelling error + "San Leandr CA": "San Leandro CA", + "Boston/Santa Fe MA/NM": "Boston/Santa Fe\tMA/NM", # Handle this specially + "Boston/New Mexico MA/NM": "Boston/Santa Fe\tMA/NM", + "Millstad IL": "Millstadt IL", # Spelling error + "Cleveland/San Francisco OH/CA": "Cleveland/San Fransisco\tOH/CA", # Handle specially + "Jamesville WI": "Janesville WI", # Spelling error + "Falls Church/Albuquerque VA/NM": "Falls Church/Albuquerque\tVA/NM", # Handle specially + "Southern Florida": "South Bay Florida", # Arbitarily chosen nearby town + "Massachusetts MA": "Massachusetts", + "New Zealand n/a": "New Zealand", + "French n/a": "France", +} + + +bad_stereo = ["SBC020","SBC021","SBC027","SBC028"] + + class Dummy_Spk_Iterator: def __init__(self): self.ind = 213 @@ -229,16 +257,40 @@ def prepare_sbcsae( doc_dir = corpus_dir / "documentation" spk2gen_dict, spk2glob_dict = generate_speaker_map_dicts(doc_dir) - + spk_coords = generate_geolocations(corpus_dir, spk2glob_dict) supervisions = [] trn_dir = corpus_dir / "TRN" - for p in trn_dir.glob("*.trn"): + for p in tqdm(list(trn_dir.glob("*.trn")), "Collecting and normalizing transcripts ..."): for supervision in _filename_to_supervisions(p, spk2gen_dict, spk2glob_dict): supervisions.append(supervision) if len(supervisions) == 0: logging.warning(f"No supervisions found in {trn_dir}") - supervisions = SupervisionSet.from_segments(supervisions) + + supervisions_ = [] + for s in supervisions: + # A final check against 0 duration segments though this should not + # occur + if s.duration < 0.02: + s_ = s.pad(pad=0.02) + else: + s_ = s + if s_.speaker in spk_coords: + s_.custom = { + 'lat': spk_coords[s.speaker][0][0], + 'lon': spk_coords[s.speaker][0][1], + } + + if ( + not isinstance(recordings[s.recording_id].channel_ids, list) or + len(recordings[s.recording_id].channel_ids) < 2 or + s.recording_id in bad_stereo + ): + s_.channel = recordings[s.recording_id].channel_ids[0] + supervisions_.append(s_) + + supervisions = SupervisionSet.from_segments(supervisions_) + recordings, supervisions = fix_manifests(recordings, supervisions) if output_dir is not None: if isinstance(output_dir, str): @@ -252,6 +304,61 @@ def prepare_sbcsae( return manifests +def generate_geolocations(corpus: Path, spk2glob_dict: dict): + if not is_module_available("geopy"): + raise ImportError( + "geopy package not found. Please install..." 
" (pip install geopy)" + ) + else: + from geopy.geocoders import Nominatim + from geopy import geocoders + + speakers = corpus.rglob("documentation/LDC*/speaker.tbl") + # This geolocator object is repsonsible for generating a + # latitiude and longitude from a textual description of a location, i.e., + # CHICAGO IL --> (41,-87) + geolocator = Nominatim(user_agent='myapplication') + spk_coords = {} + for spk in tqdm(list(speakers), "Generating speaker geolocations..."): + with open(spk) as f: + for l in f: + vals = l.strip().split(",") + if len(vals) < 5: + continue + # Check non-empty + empty_hometown = vals[4] in ("", "?") + empty_state = vals[5] in ("", "?") + if empty_hometown and not empty_state: + loc = vals[5] + ", United States" + elif not empty_hometown: + orig_loc = vals[4] + " " + vals[5] + loc = annotation_corrections.get(orig_loc, orig_loc) + else: + continue + if "/" in loc: + try: + hometowns, states = loc.split("\t", 1) + hometowns = hometowns.split("/") + states = states.split("/") + coords = [] + for h, s in zip(hometowns, states): + coords.append(geolocator.geocode(f"{h} {s}", timeout=None)[1]) + except ValueError: + states, country = loc.split(",", 1) + coords = [] + for s in states.split("/"): + coords.append(geolocator.geocode(f"{s}, {country}", timeout=None)[1]) + else: + coords = [geolocator.geocode(loc, timeout=None)[1]] + spk_coords[vals[0]] = coords + spknum2spk_name = {n.split("_")[0]: n for s, n in spk2glob_dict.items()} + spk_coords_ = {} + for s in spk_coords: + if s in spknum2spk_name: + spk_coords_[spknum2spk_name[s]] = spk_coords[s] + return spk_coords_ + + def generate_speaker_map_dicts(doc_dir: Path): spk2gen_dict = dict() spk2glob_dict = dict() From 67d839b7fc5f49038ad0d082e55a7f89391aa8fc Mon Sep 17 00:00:00 2001 From: Matthew Wiesner Date: Fri, 15 Mar 2024 16:40:15 -0400 Subject: [PATCH 05/10] Fixes a few 0-width segments by adding 0.02 s of padding --- lhotse/recipes/sbcsae.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/lhotse/recipes/sbcsae.py b/lhotse/recipes/sbcsae.py index 7092f76c0..1e975e450 100644 --- a/lhotse/recipes/sbcsae.py +++ b/lhotse/recipes/sbcsae.py @@ -28,7 +28,9 @@ from typing import Dict, Iterable, List, Optional, Sequence, Union from lhotse import Recording, RecordingSet, SupervisionSegment, SupervisionSet -from lhotse.utils import Pathlike, resumable_download +from lhotse.utils import ( + Pathlike, resumable_download, is_module_available, fastcopy, +) from lhotse import fix_manifests from tqdm import tqdm TALKBANK_MP3_ROOT_URL = "https://media.talkbank.org/ca/SBCSAE/" @@ -257,7 +259,7 @@ def prepare_sbcsae( doc_dir = corpus_dir / "documentation" spk2gen_dict, spk2glob_dict = generate_speaker_map_dicts(doc_dir) - spk_coords = generate_geolocations(corpus_dir, spk2glob_dict) + #spk_coords = generate_geolocations(corpus_dir, spk2glob_dict) supervisions = [] trn_dir = corpus_dir / "TRN" for p in tqdm(list(trn_dir.glob("*.trn")), "Collecting and normalizing transcripts ..."): @@ -269,12 +271,18 @@ def prepare_sbcsae( supervisions_ = [] for s in supervisions: - # A final check against 0 duration segments though this should not - # occur if s.duration < 0.02: - s_ = s.pad(pad=0.02) - else: + # Just pad with a minimum 0.02 duration + s_reco = recordings[s.recording_id] + new_start = max(0, s.start - 0.01) + s_ = fastcopy( + s, + start=new_start, + duration=min(new_start + 0.02, s_reco.duration), + ) + else: s_ = s + if s_.speaker in spk_coords: s_.custom = { 'lat': spk_coords[s.speaker][0][0], 
From dade60b37feb2e6d40777f3fece3f553952b802c Mon Sep 17 00:00:00 2001 From: Matthew Maciejewski Date: Tue, 30 Jul 2024 16:57:44 -0400 Subject: [PATCH 06/10] small fix --- lhotse/bin/modes/recipes/sbcsae.py | 18 ++- lhotse/recipes/sbcsae.py | 249 +++++++++++------------------ 2 files changed, 105 insertions(+), 162 deletions(-) diff --git a/lhotse/bin/modes/recipes/sbcsae.py b/lhotse/bin/modes/recipes/sbcsae.py index 23f36e09b..d447c8db2 100644 --- a/lhotse/bin/modes/recipes/sbcsae.py +++ b/lhotse/bin/modes/recipes/sbcsae.py @@ -12,26 +12,34 @@ @prepare.command(context_settings=dict(show_default=True)) @click.argument("corpus_dir", type=click.Path(exists=True, dir_okay=True)) @click.argument("output_dir", type=click.Path()) +@click.option( + "--geolocation", + type=bool, + is_flag=True, + default=False, + help="Include geographic coordinates of speakers' hometowns in the manifests.", +) def sbcsae( corpus_dir: Pathlike, output_dir: Pathlike, + geolocation: bool, ): """SBCSAE data preparation.""" - prepare_sbcsae(corpus_dir, output_dir=output_dir) + prepare_sbcsae(corpus_dir, output_dir=output_dir, geolocation=geolocation) @download.command(context_settings=dict(show_default=True)) @click.argument("target_dir", type=click.Path()) @click.option( - "--download-mp3", + "--force-download", type=bool, is_flag=True, default=False, - help="Download the mp3 copy of the audio as well as wav.", + help="Force download.", ) def sbcsae( target_dir: Pathlike, - download_mp3: Optional[bool] = False, + force_download: bool, ): """SBCSAE download.""" - download_sbcsae(target_dir, download_mp3=download_mp3) + download_sbcsae(target_dir, force_download=force_download) diff --git a/lhotse/recipes/sbcsae.py b/lhotse/recipes/sbcsae.py index 1e975e450..aba12d594 100644 --- a/lhotse/recipes/sbcsae.py +++ b/lhotse/recipes/sbcsae.py @@ -19,60 +19,45 @@ (UMass, Boston). For the publication of Parts 3 and 4, the authors are John W. Du Bois and Robert Englebretson. -TODO: detail on splits and such +If you use the corpus or our data preparation scripts, please cite the following: +@misc{dubois_2005, + author={Du Bois, John W. and Chafe, Wallace L. and Meyer, Charles and Thompson, Sandra A. and Englebretson, Robert and Martey, Nii}, + year={2000--2005}, + title={{S}anta {B}arbara corpus of spoken {A}merican {E}nglish, {P}arts 1--4}, + address={Philadelphia}, + organization={Linguistic Data Consortium}, +} +@inproceedings{maciejewski24_interspeech, + author={Matthew Maciejewski and Dominik Klement and Ruizhe Huang and Matthew Wiesner and Sanjeev Khudanpur}, + title={Evaluating the {Santa Barbara} Corpus: Challenges of the Breadth of Conversational Spoken Language}, + year=2024, + booktitle={Proc. 
Interspeech 2024} +} """ import logging import re -import zipfile +import tarfile from pathlib import Path -from typing import Dict, Iterable, List, Optional, Sequence, Union +from typing import Dict, Optional, Union + +from tqdm import tqdm -from lhotse import Recording, RecordingSet, SupervisionSegment, SupervisionSet +from lhotse import ( + Recording, + RecordingSet, + SupervisionSegment, + SupervisionSet, + fix_manifests, +) from lhotse.utils import ( - Pathlike, resumable_download, is_module_available, fastcopy, + Pathlike, + fastcopy, + is_module_available, + resumable_download, + safe_extract, ) -from lhotse import fix_manifests -from tqdm import tqdm -TALKBANK_MP3_ROOT_URL = "https://media.talkbank.org/ca/SBCSAE/" -TALKBANK_WAV_ROOT_URL = "https://media.talkbank.org/ca/SBCSAE/0wav/" -UCSB_TRANSCRIPT_URL = "https://www.linguistics.ucsb.edu/sites/secure.lsit.ucsb.edu.ling.d7/files/sitefiles/research/SBC/SBCorpus.zip" -UCSB_CHAT_URL = "https://www.linguistics.ucsb.edu/sites/secure.lsit.ucsb.edu.ling.d7/files/sitefiles/research/SBC/SBCSAE_chat.zip" -UCSB_METADATA_URL = "https://www.linguistics.ucsb.edu/sites/secure.lsit.ucsb.edu.ling.d7/files/sitefiles/research/SBC/metadata.zip" -LDC_DOC_ROOT_URL = "https://catalog.ldc.upenn.edu/docs/" -LDC_DOCS = { - "LDC2000S85": [ - "segment.tbl", - "segment.txt", - "speaker.tbl", - "speaker.txt", - ], - "LDC2003S06": [ - "annotations.txt", - "file.tbl", - "segment.tbl", - "segment.txt", - "segment_summaries.txt", - "speaker.tbl", - "speaker.txt", - "table.txt", - ], - "LDC2004S10": [ - "annotations.txt", - "file.tbl", - "segment.tbl", - "segment.txt", - "segment_summaries.txt", - "speaker.tbl", - "speaker.txt", - "table.txt", - ], - "LDC2005S25": [ - "segment.tbl", - "segment.txt", - "speaker.doc", - "speaker.tbl", - ], -} + +SBCSAE_TAR_URL = "https://www.openslr.org/resources/155/SBCSAE.tar.gz" lang_iterators = { @@ -97,27 +82,27 @@ # These corrections to the participant metadata were needed to get geolocations # from the geopy package. annotation_corrections = { - "metro St.L. IL": "Saint Louis MO", # Use the MO side of the city - "middle Wes MO": "Missouri", # Just use the state location - "S.E.Texas TX": "South East Texas", # The geo package seems to parse this - "South Alabama mostly AL": "Andalusia Alabama", # Arbitrarily chosen nearby town - "South FL": "South Bay Florida", # Arbitrarily chosen nearby town - "Walnut Cre CA": "Walnut Creek CA", # Spelling error + "metro St.L. 
IL": "Saint Louis MO", # Use the MO side of the city + "middle Wes MO": "Missouri", # Just use the state location + "S.E.Texas TX": "South East Texas", # The geo package seems to parse this + "South Alabama mostly AL": "Andalusia Alabama", # Arbitrarily chosen nearby town + "South FL": "South Bay Florida", # Arbitrarily chosen nearby town + "Walnut Cre CA": "Walnut Creek CA", # Spelling error "San Leandr CA": "San Leandro CA", - "Boston/Santa Fe MA/NM": "Boston/Santa Fe\tMA/NM", # Handle this specially + "Boston/Santa Fe MA/NM": "Boston/Santa Fe\tMA/NM", # Handle this specially "Boston/New Mexico MA/NM": "Boston/Santa Fe\tMA/NM", - "Millstad IL": "Millstadt IL", # Spelling error - "Cleveland/San Francisco OH/CA": "Cleveland/San Fransisco\tOH/CA", # Handle specially - "Jamesville WI": "Janesville WI", # Spelling error - "Falls Church/Albuquerque VA/NM": "Falls Church/Albuquerque\tVA/NM", # Handle specially - "Southern Florida": "South Bay Florida", # Arbitarily chosen nearby town + "Millstad IL": "Millstadt IL", # Spelling error + "Cleveland/San Francisco OH/CA": "Cleveland/San Fransisco\tOH/CA", # Handle specially + "Jamesville WI": "Janesville WI", # Spelling error + "Falls Church/Albuquerque VA/NM": "Falls Church/Albuquerque\tVA/NM", # Handle specially + "Southern Florida": "South Bay Florida", # Arbitarily chosen nearby town "Massachusetts MA": "Massachusetts", "New Zealand n/a": "New Zealand", "French n/a": "France", } -bad_stereo = ["SBC020","SBC021","SBC027","SBC028"] +bad_stereo = ["SBC020", "SBC021", "SBC027", "SBC028"] class Dummy_Spk_Iterator: @@ -137,93 +122,30 @@ def next(self, spk="SBCXXX_X"): def download_sbcsae( target_dir: Pathlike = ".", - download_mp3: Optional[bool] = False, force_download: Optional[bool] = False, ) -> Path: """ - Download the dataset. Due to availability/broken link issues, this downloads - from multiple sources. + Download and untar the dataset. :param: target_dir: Pathlike, the path of the directory where the SBCSAE dataset will be downloaded. - :param: download_mp3: bool, if True download the mp3 files as well as wav. + :param force_download: bool, if True, download the archive even if it already exists. :return: The path to the directory with the data. 
""" target_dir = Path(target_dir) corpus_dir = target_dir / "SBCSAE" corpus_dir.mkdir(parents=True, exist_ok=True) + tar_path = target_dir / "SBCSAE.tar.gz" completed_detector = target_dir / ".sbcsae_completed" if completed_detector.is_file(): logging.info(f"Skipping download because {completed_detector} exists.") return corpus_dir - # Download audio - wav_dir = corpus_dir / "WAV" - mp3_dir = corpus_dir / "MP3" - wav_dir.mkdir(parents=True, exist_ok=True) - mp3_dir.mkdir(parents=True, exist_ok=True) - for i in range(1, 61): - session = f"{i:02d}" - wav_path = wav_dir / ("SBC0" + session + ".wav") - resumable_download( - TALKBANK_WAV_ROOT_URL + session + ".wav", - filename=wav_path, - force_download=force_download, - ) - if download_mp3: - mp3_path = mp3_dir / ("SBC0" + session + ".mp3") - resumable_download( - TALKBANK_MP3_ROOT_URL + session + ".mp3", - filename=mp3_path, - force_download=force_download, - ) - - # Download annotations - transcript_zip = corpus_dir / "TRN.zip" - resumable_download( - UCSB_TRANSCRIPT_URL, filename=transcript_zip, force_download=force_download - ) - with zipfile.ZipFile(transcript_zip) as f: - f.extractall(path=corpus_dir) - - chat_zip = corpus_dir / "CHAT.zip" - resumable_download(UCSB_CHAT_URL, filename=chat_zip, force_download=force_download) - target_chat_dir = corpus_dir / "CHAT" - if target_chat_dir.is_dir(): - if not any(target_chat_dir.iterdir()): - target_chat_dir.rmdir() - elif force_download: - for item in target_chat_dir.iterdir(): - item.unlink() - target_chat_dir.rmdir() - else: - with zipfile.ZipFile(chat_zip) as f: - f.extractall(path=corpus_dir) - chat_dir = corpus_dir / "SBCSAE" - chat_dir.rename(corpus_dir / "CHAT") - - metadata_zip = corpus_dir / "metadata.zip" - resumable_download( - UCSB_METADATA_URL, filename=metadata_zip, force_download=force_download - ) - metadata_dir = corpus_dir / "metadata" - metadata_dir.mkdir(parents=True, exist_ok=True) - with zipfile.ZipFile(metadata_zip) as f: - f.extractall(path=metadata_dir) - - doc_dir = corpus_dir / "documentation" - doc_dir.mkdir(parents=True, exist_ok=True) - for LDC_split in LDC_DOCS: - LDC_dir = doc_dir / LDC_split - LDC_dir.mkdir(parents=True, exist_ok=True) - for doc_file in LDC_DOCS[LDC_split]: - doc_file_url = LDC_DOC_ROOT_URL + LDC_split + "/" + doc_file - resumable_download( - doc_file_url, filename=LDC_dir / doc_file, force_download=force_download - ) - - completed_detector.touch() + resumable_download(SBCSAE_TAR_URL, filename=tar_path, force_download=force_download) + with tarfile.open(tar_path) as tar: + safe_extract(tar, path=corpus_dir) + completed_detector.touch() return corpus_dir @@ -231,6 +153,7 @@ def download_sbcsae( def prepare_sbcsae( corpus_dir: Pathlike, output_dir: Optional[Pathlike] = None, + geolocation: Optional[bool] = False, ) -> Dict[str, Union[RecordingSet, SupervisionSet]]: """ Prepares manifest for SBCSAE dataset. @@ -240,7 +163,9 @@ def prepare_sbcsae( releases of the data. Check script comments for details if using an existing corpus download rather than Lhotse's download script. :param: output_dir: Root directory where .json manifests are stored. - :return: + :param: geolocation: Include geographic coordinates of speakers' hometowns + in the manifests. + :return: The manifests. 
""" # Resolve corpus_dir type if isinstance(corpus_dir, str): @@ -257,12 +182,18 @@ def prepare_sbcsae( if len(recordings) == 0: logging.warning(f"No .wav files found in {audio_dir}") - doc_dir = corpus_dir / "documentation" + doc_dir = corpus_dir / "docs" spk2gen_dict, spk2glob_dict = generate_speaker_map_dicts(doc_dir) - #spk_coords = generate_geolocations(corpus_dir, spk2glob_dict) + + spk_coords = {} + if geolocation: + spk_coords = generate_geolocations(corpus_dir, spk2glob_dict) + supervisions = [] trn_dir = corpus_dir / "TRN" - for p in tqdm(list(trn_dir.glob("*.trn")), "Collecting and normalizing transcripts ..."): + for p in tqdm( + list(trn_dir.glob("*.trn")), "Collecting and normalizing transcripts ..." + ): for supervision in _filename_to_supervisions(p, spk2gen_dict, spk2glob_dict): supervisions.append(supervision) @@ -280,23 +211,23 @@ def prepare_sbcsae( start=new_start, duration=min(new_start + 0.02, s_reco.duration), ) - else: + else: s_ = s - + if s_.speaker in spk_coords: s_.custom = { - 'lat': spk_coords[s.speaker][0][0], - 'lon': spk_coords[s.speaker][0][1], + "lat": spk_coords[s.speaker][0][0], + "lon": spk_coords[s.speaker][0][1], } if ( - not isinstance(recordings[s.recording_id].channel_ids, list) or - len(recordings[s.recording_id].channel_ids) < 2 or - s.recording_id in bad_stereo + not isinstance(recordings[s.recording_id].channel_ids, list) + or len(recordings[s.recording_id].channel_ids) < 2 + or s.recording_id in bad_stereo ): s_.channel = recordings[s.recording_id].channel_ids[0] - supervisions_.append(s_) - + supervisions_.append(s_) + supervisions = SupervisionSet.from_segments(supervisions_) recordings, supervisions = fix_manifests(recordings, supervisions) @@ -318,14 +249,14 @@ def generate_geolocations(corpus: Path, spk2glob_dict: dict): "geopy package not found. Please install..." 
" (pip install geopy)" ) else: - from geopy.geocoders import Nominatim from geopy import geocoders + from geopy.geocoders import Nominatim - speakers = corpus.rglob("documentation/LDC*/speaker.tbl") - # This geolocator object is repsonsible for generating a + speakers = corpus.rglob("docs/Part_*/speaker.tbl") + # This geolocator object is repsonsible for generating a # latitiude and longitude from a textual description of a location, i.e., - # CHICAGO IL --> (41,-87) - geolocator = Nominatim(user_agent='myapplication') + # CHICAGO IL --> (41,-87) + geolocator = Nominatim(user_agent="myapplication") spk_coords = {} for spk in tqdm(list(speakers), "Generating speaker geolocations..."): with open(spk) as f: @@ -350,12 +281,16 @@ def generate_geolocations(corpus: Path, spk2glob_dict: dict): states = states.split("/") coords = [] for h, s in zip(hometowns, states): - coords.append(geolocator.geocode(f"{h} {s}", timeout=None)[1]) + coords.append( + geolocator.geocode(f"{h} {s}", timeout=None)[1] + ) except ValueError: states, country = loc.split(",", 1) coords = [] for s in states.split("/"): - coords.append(geolocator.geocode(f"{s}, {country}", timeout=None)[1]) + coords.append( + geolocator.geocode(f"{s}, {country}", timeout=None)[1] + ) else: coords = [geolocator.geocode(loc, timeout=None)[1]] spk_coords[vals[0]] = coords @@ -372,8 +307,8 @@ def generate_speaker_map_dicts(doc_dir: Path): spk2glob_dict = dict() spk_num_to_reco_ids = dict() - for LDC_split in ["LDC2000S85", "LDC2003S06", "LDC2005S25"]: - filename = doc_dir / LDC_split / "segment.tbl" + for part in ["Part_1", "Part_2", "Part_4"]: + filename = doc_dir / part / "segment.tbl" for line in filename.read_text().split("\n"): if "speaker:" in line: line = line.replace(" 0", "\t0") @@ -384,8 +319,8 @@ def generate_speaker_map_dicts(doc_dir: Path): if reco_id not in spk_num_to_reco_ids[spk_num]: spk_num_to_reco_ids[spk_num].append(reco_id) - for LDC_split in ["LDC2000S85", "LDC2003S06", "LDC2005S25"]: - filename = doc_dir / LDC_split / "speaker.tbl" + for part in ["Part_1", "Part_2", "Part_4"]: + filename = doc_dir / part / "speaker.tbl" for line in filename.read_text().split("\n"): if "," not in line: continue @@ -404,9 +339,9 @@ def generate_speaker_map_dicts(doc_dir: Path): spk2gen_dict[reco + "_" + name] = gen spk2glob_dict[reco + "_" + name] = spk_num + "_" + name - for LDC_split in ["LDC2004S10"]: + for part in ["Part_3"]: seg_list = [] - filename = doc_dir / LDC_split / "segment.tbl" + filename = doc_dir / part / "segment.tbl" for line in filename.read_text().split("\n"): if "speaker:" in line: reco_id = re.sub(r"sbc0?([0-9]{3})\s.*", r"SBC\1", line) @@ -414,7 +349,7 @@ def generate_speaker_map_dicts(doc_dir: Path): seg_list.append([name, reco_id]) spk_list = [] - filename = doc_dir / LDC_split / "speaker.tbl" + filename = doc_dir / part / "speaker.tbl" for line in filename.read_text().split("\n"): if "," not in line: continue From ae9c4588dd0f8580bf42272cd1dba398d0488ce9 Mon Sep 17 00:00:00 2001 From: Dominik Klement Date: Tue, 17 Sep 2024 14:47:48 -0400 Subject: [PATCH 07/10] Add alignment export option Exports aligned supervisions along with the original supervisions with or without changing the text after manual inspections and corrections. 
--- lhotse/bin/modes/recipes/sbcsae.py | 22 +++- lhotse/recipes/sbcsae.py | 159 ++++++++++++++++++++++++++++- 2 files changed, 179 insertions(+), 2 deletions(-) diff --git a/lhotse/bin/modes/recipes/sbcsae.py b/lhotse/bin/modes/recipes/sbcsae.py index d447c8db2..3a89291cf 100644 --- a/lhotse/bin/modes/recipes/sbcsae.py +++ b/lhotse/bin/modes/recipes/sbcsae.py @@ -19,13 +19,33 @@ default=False, help="Include geographic coordinates of speakers' hometowns in the manifests.", ) +@click.option( + "--export-alignments", + type=bool, + is_flag=True, + default=True, + help="Export re-aligned manifests.", +) +@click.option( + "--fix-transcripts", + type=bool, + is_flag=True, + default=True, + help="Replace transcripts by the the re-aligned ones (with manual fixes applied).", +) def sbcsae( corpus_dir: Pathlike, output_dir: Pathlike, geolocation: bool, + export_alignments: bool, + fix_transcripts: bool, ): """SBCSAE data preparation.""" - prepare_sbcsae(corpus_dir, output_dir=output_dir, geolocation=geolocation) + prepare_sbcsae(corpus_dir, + output_dir=output_dir, + geolocation=geolocation, + export_alignments=export_alignments, + fix_transcripts=fix_transcripts) @download.command(context_settings=dict(show_default=True)) diff --git a/lhotse/recipes/sbcsae.py b/lhotse/recipes/sbcsae.py index aba12d594..00a4f5c34 100644 --- a/lhotse/recipes/sbcsae.py +++ b/lhotse/recipes/sbcsae.py @@ -34,10 +34,13 @@ booktitle={Proc. Interspeech 2024} } """ +from copy import deepcopy +from dataclasses import dataclass import logging +from math import inf +from pathlib import Path import re import tarfile -from pathlib import Path from typing import Dict, Optional, Union from tqdm import tqdm @@ -154,6 +157,8 @@ def prepare_sbcsae( corpus_dir: Pathlike, output_dir: Optional[Pathlike] = None, geolocation: Optional[bool] = False, + export_alignments: Optional[bool] = True, + fix_transcripts: Optional[bool] = True, ) -> Dict[str, Union[RecordingSet, SupervisionSet]]: """ Prepares manifest for SBCSAE dataset. 
@@ -240,6 +245,21 @@ def prepare_sbcsae( manifests = {"recordings": recordings, "supervisions": supervisions} + if export_alignments: + asr_supervisions, diar_supervisions = apply_aligned_stms(list(recordings.ids), supervisions, + change_text=fix_transcripts) + _, asr_supervisions = fix_manifests(recordings, asr_supervisions) + _, diar_supervisions = fix_manifests(recordings, diar_supervisions) + + asr_supervisions.to_file(output_dir / "sbcsae_supervisions_asr_aligned.jsonl.gz") + diar_supervisions.to_file(output_dir / "sbcsae_supervisions_diar_aligned.jsonl.gz") + + manifests = { + "asr_supervisions": asr_supervisions, + "diar_supervisions": diar_supervisions, + **manifests + } + return manifests @@ -960,3 +980,140 @@ def _parse_raw_transcript(transcript: str): # transcript = transcript.replace(" L2>", "") return transcript, lang_tag + + +@dataclass +class StmSegment: + recording_id: str + speaker: str + start: float + end: float + text: str + channel: str = "1" + + +def parse_stm_file(data: str) -> list[StmSegment]: + lines = data.split("\n") + stm_segments = [] + + for line in lines: + if not line: + continue + + fields = line.strip().split() + reco_id, channel, speaker = fields[:3] + start, end = [float(time) for time in fields[3:5]] + text = " ".join(fields[5:]) + + stm_segments.append( + StmSegment(recording_id=reco_id, speaker=speaker, start=start, end=end, text=text, channel=channel) + ) + + return stm_segments + + +def retrieve_stm_file(url) -> list[StmSegment]: + import urllib.request + + response = urllib.request.urlopen(url) + data = response.read().decode("utf-8") + + return parse_stm_file(data) + + +def norm_txt(text: str): + text = text.strip() + text = text.lower() + return text + + +def compute_iou(seg1: SupervisionSegment, seg2: StmSegment) -> float: + start = max(seg1.start, seg2.start) + end = min(seg1.end, seg2.end) + + intersection = max(0.0, end - start) + union = (seg1.end - seg1.start) + (seg2.end - seg2.start) - intersection + + return intersection / union + + +def apply_stm(recording_ids: list[str], supervisions: SupervisionSet, aligned_stm_segs: list[StmSegment], + change_text: bool = False) -> SupervisionSet: + + if not is_module_available("intervaltree"): + raise ImportError( + "intervaltree package not found. Please install..." " (pip install intervaltree)" + ) + else: + from intervaltree import IntervalTree + + if not is_module_available("jiwer"): + raise ImportError( + "jiwer package not found. Please install..." " (pip install jiwer==3.0.4)" + ) + else: + from jiwer import cer + + sset = deepcopy(supervisions) + + per_rec_its = {} + for rid in recording_ids: + per_rec_its[rid] = IntervalTree() + for stm_seg in tqdm(aligned_stm_segs, desc="Building interval tree..."): + per_rec_its[stm_seg.recording_id][stm_seg.start:stm_seg.end] = stm_seg + + for s in tqdm(sset, desc="Applying STM..."): + # We need to find the closest and best-matching segment. + # Some labeled segments were misplaced a lot and fixed by manual post-processing. + # Hence, in order to find a good match, we tuned collar value to find all matches. + # Example: 451 seconds, SBC027 recording. + collar = 2.0 + matching_segments = list(filter(lambda x: x.data.speaker == s.speaker, per_rec_its[s.recording_id][s.start-collar:s.end+collar])) + # Alignments used slightly different speaker IDs for UNK speakers, so we relax the speaker ID matching. 
+ if not matching_segments: + matching_segments = per_rec_its[s.recording_id][s.start-collar:s.end+collar] + + best_cer = inf + best_cer_res = None + best_matching_seg = None + best_iou = 0.0 + + for matching_seg in matching_segments: + cer_res = cer(norm_txt(s.text), norm_txt(matching_seg.data.text), return_dict=True) + cer_val = cer_res["cer"] + + if cer_val < best_cer: + best_cer = cer_val + best_cer_res = cer_res + best_matching_seg = matching_seg + best_iou = compute_iou(s, matching_seg.data) + + # There's been an update between the alignments and the lhotse recipe, so some UNK speakers have shifted IDs. + # It's enough to match the speaker names (or UNK). + if cer_val == best_cer and matching_seg.data.speaker.split("_")[1] == s.speaker.split("_")[1]: + current_iou = compute_iou(s, matching_seg.data) + if current_iou >= best_iou: + best_matching_seg = matching_seg + best_cer_res = cer_res + best_iou = current_iou + + if s.speaker.split("_")[1] == best_matching_seg.data.speaker.split("_")[1] and best_cer_res["substitutions"] == best_cer_res["deletions"] == 0 and (best_cer < 0.5 or len(s.text) < 3): + s.start = best_matching_seg.data.start + s.duration = best_matching_seg.data.end - best_matching_seg.data.start + if change_text: + s.text = best_matching_seg.data.text + + per_rec_its[s.recording_id].remove(best_matching_seg) + + return sset + + +def apply_aligned_stms(recording_ids: list[str], processed_supervisions: SupervisionSet, + change_text: bool = False) -> tuple[SupervisionSet, SupervisionSet]: + aligned_for_asr_stm = retrieve_stm_file("https://raw.githubusercontent.com/domklement/SBCSAE_alignments/main/alignments/stm/aligned_for_asr.stm") + aligned_for_diar_stm = retrieve_stm_file("https://raw.githubusercontent.com/domklement/SBCSAE_alignments/main/alignments/stm/aligned_for_diar.stm") + + asr_sup = apply_stm(recording_ids, processed_supervisions, aligned_for_asr_stm, change_text) + diar_sup = apply_stm(recording_ids, processed_supervisions, aligned_for_diar_stm, change_text) + + return asr_sup, diar_sup From be64e93ffe278079a9e6a16b66796d55c10492ba Mon Sep 17 00:00:00 2001 From: Matthew Maciejewski Date: Wed, 18 Sep 2024 11:39:15 -0400 Subject: [PATCH 08/10] update to cli flags and docs --- lhotse/bin/modes/recipes/sbcsae.py | 27 ++++----- lhotse/recipes/sbcsae.py | 94 +++++++++++++++++++++--------- 2 files changed, 75 insertions(+), 46 deletions(-) diff --git a/lhotse/bin/modes/recipes/sbcsae.py b/lhotse/bin/modes/recipes/sbcsae.py index 3a89291cf..d6eece516 100644 --- a/lhotse/bin/modes/recipes/sbcsae.py +++ b/lhotse/bin/modes/recipes/sbcsae.py @@ -20,32 +20,25 @@ help="Include geographic coordinates of speakers' hometowns in the manifests.", ) @click.option( - "--export-alignments", + "--omit-realignments", type=bool, is_flag=True, - default=True, - help="Export re-aligned manifests.", -) -@click.option( - "--fix-transcripts", - type=bool, - is_flag=True, - default=True, - help="Replace transcripts by the re-aligned ones (with manual fixes applied).", + default=False, + help="Only output the original corpus segmentation without boundary improvements.", ) def sbcsae( corpus_dir: Pathlike, output_dir: Pathlike, geolocation: bool, - export_alignments: bool, - fix_transcripts: bool, + omit_realignments: bool, ): """SBCSAE data preparation.""" - prepare_sbcsae(corpus_dir, - output_dir=output_dir, - geolocation=geolocation, - export_alignments=export_alignments, - fix_transcripts=fix_transcripts) + prepare_sbcsae( + corpus_dir, + output_dir=output_dir, +
geolocation=geolocation, + omit_realignments=omit_realignments, + ) @download.command(context_settings=dict(show_default=True)) diff --git a/lhotse/recipes/sbcsae.py b/lhotse/recipes/sbcsae.py index 00a4f5c34..504e9f337 100644 --- a/lhotse/recipes/sbcsae.py +++ b/lhotse/recipes/sbcsae.py @@ -34,13 +34,13 @@ booktitle={Proc. Interspeech 2024} } """ +import logging +import re +import tarfile from copy import deepcopy from dataclasses import dataclass -import logging from math import inf from pathlib import Path -import re -import tarfile from typing import Dict, Optional, Union from tqdm import tqdm @@ -157,8 +157,7 @@ def prepare_sbcsae( corpus_dir: Pathlike, output_dir: Optional[Pathlike] = None, geolocation: Optional[bool] = False, - export_alignments: Optional[bool] = True, - fix_transcripts: Optional[bool] = True, + omit_realignments: Optional[bool] = False, ) -> Dict[str, Union[RecordingSet, SupervisionSet]]: """ Prepares manifest for SBCSAE dataset. @@ -170,6 +169,7 @@ def prepare_sbcsae( :param: output_dir: Root directory where .json manifests are stored. :param: geolocation: Include geographic coordinates of speakers' hometowns in the manifests. + :param: omit_realignments: Only output original corpus segmentation. :return: The manifests. """ # Resolve corpus_dir type @@ -245,19 +245,24 @@ def prepare_sbcsae( manifests = {"recordings": recordings, "supervisions": supervisions} - if export_alignments: - asr_supervisions, diar_supervisions = apply_aligned_stms(list(recordings.ids), supervisions, - change_text=fix_transcripts) + if not omit_realignments: + asr_supervisions, diar_supervisions = apply_aligned_stms( + list(recordings.ids), supervisions + ) _, asr_supervisions = fix_manifests(recordings, asr_supervisions) _, diar_supervisions = fix_manifests(recordings, diar_supervisions) - asr_supervisions.to_file(output_dir / "sbcsae_supervisions_asr_aligned.jsonl.gz") - diar_supervisions.to_file(output_dir / "sbcsae_supervisions_diar_aligned.jsonl.gz") + asr_supervisions.to_file( + output_dir / "sbcsae_supervisions_asr_aligned.jsonl.gz" + ) + diar_supervisions.to_file( + output_dir / "sbcsae_supervisions_diar_aligned.jsonl.gz" + ) manifests = { "asr_supervisions": asr_supervisions, "diar_supervisions": diar_supervisions, - **manifests + **manifests, } return manifests @@ -1006,7 +1011,14 @@ def parse_stm_file(data: str) -> list[StmSegment]: text = " ".join(fields[5:]) stm_segments.append( - StmSegment(recording_id=reco_id, speaker=speaker, start=start, end=end, text=text, channel=channel) + StmSegment( + recording_id=reco_id, + speaker=speaker, + start=start, + end=end, + text=text, + channel=channel, + ) ) return stm_segments @@ -1037,12 +1049,16 @@ def compute_iou(seg1: SupervisionSegment, seg2: StmSegment) -> float: return intersection / union -def apply_stm(recording_ids: list[str], supervisions: SupervisionSet, aligned_stm_segs: list[StmSegment], - change_text: bool = False) -> SupervisionSet: +def apply_stm( + recording_ids: list[str], + supervisions: SupervisionSet, + aligned_stm_segs: list[StmSegment], +) -> SupervisionSet: if not is_module_available("intervaltree"): raise ImportError( - "intervaltree package not found. Please install..." " (pip install intervaltree)" + "intervaltree package not found. Please install..." 
+ " (pip install intervaltree)" ) else: from intervaltree import IntervalTree @@ -1060,7 +1076,7 @@ def apply_stm(recording_ids: list[str], supervisions: SupervisionSet, aligned_st for rid in recording_ids: per_rec_its[rid] = IntervalTree() for stm_seg in tqdm(aligned_stm_segs, desc="Building interval tree..."): - per_rec_its[stm_seg.recording_id][stm_seg.start:stm_seg.end] = stm_seg + per_rec_its[stm_seg.recording_id][stm_seg.start : stm_seg.end] = stm_seg for s in tqdm(sset, desc="Applying STM..."): # We need to find the closest and best-matching segment. @@ -1068,10 +1084,17 @@ def apply_stm(recording_ids: list[str], supervisions: SupervisionSet, aligned_st # Hence, in order to find a good match, we tuned collar value to find all matches. # Example: 451 seconds, SBC027 recording. collar = 2.0 - matching_segments = list(filter(lambda x: x.data.speaker == s.speaker, per_rec_its[s.recording_id][s.start-collar:s.end+collar])) + matching_segments = list( + filter( + lambda x: x.data.speaker == s.speaker, + per_rec_its[s.recording_id][s.start - collar : s.end + collar], + ) + ) # Alignments used slightly different speaker IDs for UNK speakers, so we relax the speaker ID matching. if not matching_segments: - matching_segments = per_rec_its[s.recording_id][s.start-collar:s.end+collar] + matching_segments = per_rec_its[s.recording_id][ + s.start - collar : s.end + collar + ] best_cer = inf best_cer_res = None @@ -1079,7 +1102,9 @@ def apply_stm(recording_ids: list[str], supervisions: SupervisionSet, aligned_st best_iou = 0.0 for matching_seg in matching_segments: - cer_res = cer(norm_txt(s.text), norm_txt(matching_seg.data.text), return_dict=True) + cer_res = cer( + norm_txt(s.text), norm_txt(matching_seg.data.text), return_dict=True + ) cer_val = cer_res["cer"] if cer_val < best_cer: @@ -1090,30 +1115,41 @@ def apply_stm(recording_ids: list[str], supervisions: SupervisionSet, aligned_st # There's been an update between the alignments and the lhotse recipe, so some UNK speakers have shifted IDs. # It's enough to match the speaker names (or UNK). 
- if cer_val == best_cer and matching_seg.data.speaker.split("_")[1] == s.speaker.split("_")[1]: + if ( + cer_val == best_cer + and matching_seg.data.speaker.split("_")[1] == s.speaker.split("_")[1] + ): current_iou = compute_iou(s, matching_seg.data) if current_iou >= best_iou: best_matching_seg = matching_seg best_cer_res = cer_res best_iou = current_iou - if s.speaker.split("_")[1] == best_matching_seg.data.speaker.split("_")[1] and best_cer_res["substitutions"] == best_cer_res["deletions"] == 0 and (best_cer < 0.5 or len(s.text) < 3): + if ( + s.speaker.split("_")[1] == best_matching_seg.data.speaker.split("_")[1] + and best_cer_res["substitutions"] == best_cer_res["deletions"] == 0 + and (best_cer < 0.5 or len(s.text) < 3) + ): s.start = best_matching_seg.data.start s.duration = best_matching_seg.data.end - best_matching_seg.data.start - if change_text: - s.text = best_matching_seg.data.text + s.text = best_matching_seg.data.text per_rec_its[s.recording_id].remove(best_matching_seg) return sset -def apply_aligned_stms(recording_ids: list[str], processed_supervisions: SupervisionSet, - change_text: bool = False) -> tuple[SupervisionSet, SupervisionSet]: - aligned_for_asr_stm = retrieve_stm_file("https://raw.githubusercontent.com/domklement/SBCSAE_alignments/main/alignments/stm/aligned_for_asr.stm") - aligned_for_diar_stm = retrieve_stm_file("https://raw.githubusercontent.com/domklement/SBCSAE_alignments/main/alignments/stm/aligned_for_diar.stm") +def apply_aligned_stms( + recording_ids: list[str], processed_supervisions: SupervisionSet +) -> tuple[SupervisionSet, SupervisionSet]: + aligned_for_asr_stm = retrieve_stm_file( + "https://raw.githubusercontent.com/domklement/SBCSAE_alignments/main/alignments/stm/aligned_for_asr.stm" + ) + aligned_for_diar_stm = retrieve_stm_file( + "https://raw.githubusercontent.com/domklement/SBCSAE_alignments/main/alignments/stm/aligned_for_diar.stm" + ) - asr_sup = apply_stm(recording_ids, processed_supervisions, aligned_for_asr_stm, change_text) - diar_sup = apply_stm(recording_ids, processed_supervisions, aligned_for_diar_stm, change_text) + asr_sup = apply_stm(recording_ids, processed_supervisions, aligned_for_asr_stm) + diar_sup = apply_stm(recording_ids, processed_supervisions, aligned_for_diar_stm) return asr_sup, diar_sup From ceb7ebbf133dc03d275e22b8c570f568da73ae32 Mon Sep 17 00:00:00 2001 From: Matthew Maciejewski Date: Thu, 3 Oct 2024 09:45:48 -0400 Subject: [PATCH 09/10] added sbcsae to docs and fixed python compatibility --- docs/corpus.rst | 2 ++ lhotse/recipes/sbcsae.py | 6 +++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/docs/corpus.rst b/docs/corpus.rst index 6a5be4f97..bc8d71bfb 100644 --- a/docs/corpus.rst +++ b/docs/corpus.rst @@ -173,6 +173,8 @@ a CLI tool that create the manifests given a corpus directory. 
- :func:`lhotse.recipes.prepare_reazonspeech` * - RIRs and Noises Corpus (OpenSLR 28) - :func:`lhotse.recipes.prepare_rir_noise` + * - SBCSAE + - :func:`lhotse.recipes.prepare_sbcsae` * - Spatial-LibriSpeech - :func:`lhotse.recipes.prepare_spatial_librispeech` * - Speech Commands diff --git a/lhotse/recipes/sbcsae.py b/lhotse/recipes/sbcsae.py index 504e9f337..d318678ad 100644 --- a/lhotse/recipes/sbcsae.py +++ b/lhotse/recipes/sbcsae.py @@ -41,7 +41,7 @@ from dataclasses import dataclass from math import inf from pathlib import Path -from typing import Dict, Optional, Union +from typing import Dict, List, Optional, Union from tqdm import tqdm @@ -997,7 +997,7 @@ class StmSegment: channel: str = "1" -def parse_stm_file(data: str) -> list[StmSegment]: +def parse_stm_file(data: str) -> List[StmSegment]: lines = data.split("\n") stm_segments = [] @@ -1024,7 +1024,7 @@ def parse_stm_file(data: str) -> list[StmSegment]: return stm_segments -def retrieve_stm_file(url) -> list[StmSegment]: +def retrieve_stm_file(url) -> List[StmSegment]: import urllib.request response = urllib.request.urlopen(url) From 538c4be5b0e9d8f9e3402228cd2e931e8a776987 Mon Sep 17 00:00:00 2001 From: Matthew Maciejewski Date: Fri, 4 Oct 2024 10:31:12 -0400 Subject: [PATCH 10/10] more python3.8 fixes --- lhotse/recipes/sbcsae.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lhotse/recipes/sbcsae.py b/lhotse/recipes/sbcsae.py index d318678ad..4927548ba 100644 --- a/lhotse/recipes/sbcsae.py +++ b/lhotse/recipes/sbcsae.py @@ -41,7 +41,7 @@ from dataclasses import dataclass from math import inf from pathlib import Path -from typing import Dict, List, Optional, Union +from typing import Dict, List, Optional, Tuple, Union from tqdm import tqdm @@ -1050,9 +1050,9 @@ def compute_iou(seg1: SupervisionSegment, seg2: StmSegment) -> float: def apply_stm( - recording_ids: list[str], + recording_ids: List[str], supervisions: SupervisionSet, - aligned_stm_segs: list[StmSegment], + aligned_stm_segs: List[StmSegment], ) -> SupervisionSet: if not is_module_available("intervaltree"): @@ -1140,8 +1140,8 @@ def apply_stm( def apply_aligned_stms( - recording_ids: list[str], processed_supervisions: SupervisionSet -) -> tuple[SupervisionSet, SupervisionSet]: + recording_ids: List[str], processed_supervisions: SupervisionSet +) -> Tuple[SupervisionSet, SupervisionSet]: aligned_for_asr_stm = retrieve_stm_file( "https://raw.githubusercontent.com/domklement/SBCSAE_alignments/main/alignments/stm/aligned_for_asr.stm" )
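A closing note on the two compatibility patches above: PEP 585 built-in generics such as list[str] and tuple[...] only became subscriptable at runtime in Python 3.9, and annotations are evaluated when the def statement executes, so the earlier spellings made the module fail at import time on Python 3.8. A minimal illustration, not taken from the recipe:

    # Python 3.8:
    def ids() -> list[str]:  # TypeError: 'type' object is not subscriptable
        return []

    # The portable spelling the patches switch to:
    from typing import List

    def ids() -> List[str]:  # works on 3.8 and newer
        return []

For reference, the STM records consumed by parse_stm_file follow the usual "<recording-id> <channel> <speaker> <start> <end> <text...>" layout, one segment per line. A sketch with an invented record (the speaker ID and times are made up for illustration):

    from lhotse.recipes.sbcsae import parse_stm_file

    segs = parse_stm_file("SBC001 1 0001_LENORE 7.33 9.01 and then we left")
    # -> [StmSegment(recording_id="SBC001", speaker="0001_LENORE",
    #                start=7.33, end=9.01, text="and then we left", channel="1")]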