Get timestamps during decoding (#598)
* print out timestamps during decoding
* add word-level alignments
* support computing the mean symbol delay with word-level alignments (see the sketch below)
* print the variance of the symbol delay
* update doc
* support computing the delay for pruned_transducer_stateless4
* fix bug
* add doc
1 parent ff3f026, commit 0366877.
Showing 8 changed files with 1,094 additions and 150 deletions.
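The delay-related changes in this commit compare the time at which the decoder emits each word against the reference word-level alignment. As a hedged illustration only (this is not the code added in the commit; the function name symbol_delay_stats and the assumption that reference and hypothesis contain the same words in the same order are mine), the mean and variance of the symbol delay could be computed roughly like this:

#!/usr/bin/env python3
# Hedged sketch: illustrates the idea of "mean/variance of symbol delay",
# i.e. decoded emission time minus reference start time per word.
# It is NOT the implementation added in this commit.

from statistics import mean, pvariance
from typing import List, Tuple


def symbol_delay_stats(
    ref_starts: List[float], hyp_times: List[float]
) -> Tuple[float, float]:
    """Return (mean, variance) of (hyp_time - ref_start) over aligned words.

    Assumes both lists refer to the same words in the same order, e.g. after
    dropping utterances with insertion/deletion errors.
    """
    assert len(ref_starts) == len(hyp_times)
    delays = [h - r for r, h in zip(ref_starts, hyp_times)]
    return mean(delays), pvariance(delays)


if __name__ == "__main__":
    # Reference word start times (seconds) from the forced alignments,
    # and the times at which the decoder emitted the same words.
    ref = [0.10, 0.55, 1.20]
    hyp = [0.28, 0.71, 1.42]
    m, v = symbol_delay_stats(ref, hyp)
    print(f"mean symbol delay: {m:.3f} s, variance: {v:.4f}")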
@@ -0,0 +1,12 @@
#!/usr/bin/env bash

set -eou pipefail

alignments_dir=data/alignment
cuts_in_dir=data/fbank
cuts_out_dir=data/fbank_ali

python3 ./local/add_alignment_librispeech.py \
  --alignments-dir $alignments_dir \
  --cuts-in-dir $cuts_in_dir \
  --cuts-out-dir $cuts_out_dir
@@ -0,0 +1,196 @@
#!/usr/bin/env python3
# Copyright 2022 Xiaomi Corp. (authors: Zengwei Yao)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


""" | ||
This file adds alignments from https://github.com/CorentinJ/librispeech-alignments # noqa | ||
to the existing fbank features dir (e.g., data/fbank) | ||
and save cuts to a new dir (e.g., data/fbank_ali). | ||
""" | ||

import argparse
import logging
import zipfile
from pathlib import Path
from typing import List

from lhotse import CutSet, load_manifest_lazy
from lhotse.recipes.librispeech import parse_alignments
from lhotse.utils import is_module_available

LIBRISPEECH_ALIGNMENTS_URL = (
    "https://drive.google.com/uc?id=1WYfgr31T-PPwMcxuAq09XZfHQO5Mw8fE"
)

DATASET_PARTS = [
    "dev-clean",
    "dev-other",
    "test-clean",
    "test-other",
    "train-clean-100",
    "train-clean-360",
    "train-other-500",
]


def get_parser():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument(
        "--alignments-dir",
        type=str,
        default="data/alignment",
        help="The dir to save alignments.",
    )

    parser.add_argument(
        "--cuts-in-dir",
        type=str,
        default="data/fbank",
        help="The dir of the existing cuts without alignments.",
    )

    parser.add_argument(
        "--cuts-out-dir",
        type=str,
        default="data/fbank_ali",
        help="The dir to save the new cuts with alignments",
    )

    return parser


def download_alignments(
    target_dir: str, alignments_url: str = LIBRISPEECH_ALIGNMENTS_URL
):
""" | ||
Download and extract the alignments. | ||
Note: If you can not access drive.google.com, you could download the file | ||
`LibriSpeech-Alignments.zip` from huggingface: | ||
https://huggingface.co/Zengwei/librispeech-alignments | ||
and extract the zip file manually. | ||
Args: | ||
target_dir: | ||
The dir to save alignments. | ||
alignments_url: | ||
The URL of alignments. | ||
""" | ||
"""Modified from https://github.com/lhotse-speech/lhotse/blob/master/lhotse/recipes/librispeech.py""" # noqa | ||
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    completed_detector = target_dir / ".ali_completed"
    if completed_detector.is_file():
        logging.info("The alignment files already exist.")
        return

    ali_zip_path = target_dir / "LibriSpeech-Alignments.zip"
    if not ali_zip_path.is_file():
        assert is_module_available(
            "gdown"
        ), 'To download LibriSpeech alignments, please run "pip install gdown"'  # noqa
        import gdown

        gdown.download(alignments_url, output=str(ali_zip_path))

    with zipfile.ZipFile(str(ali_zip_path)) as f:
        f.extractall(path=target_dir)
        completed_detector.touch()


def add_alignment(
    alignments_dir: str,
    cuts_in_dir: str = "data/fbank",
    cuts_out_dir: str = "data/fbank_ali",
    dataset_parts: List[str] = DATASET_PARTS,
):
    """
    Add alignment info to existing cuts.
    Args:
      alignments_dir:
        The dir of the alignments.
      cuts_in_dir:
        The dir of the existing cuts.
      cuts_out_dir:
        The dir to save the new cuts with alignments.
      dataset_parts:
        Librispeech parts to add alignments.
    """
    alignments_dir = Path(alignments_dir)
    cuts_in_dir = Path(cuts_in_dir)
    cuts_out_dir = Path(cuts_out_dir)
    cuts_out_dir.mkdir(parents=True, exist_ok=True)

    for part in dataset_parts:
        logging.info(f"Processing {part}")

        cuts_in_path = cuts_in_dir / f"librispeech_cuts_{part}.jsonl.gz"
        if not cuts_in_path.is_file():
            logging.info(f"{cuts_in_path} does not exist - skipping.")
            continue
        cuts_out_path = cuts_out_dir / f"librispeech_cuts_{part}.jsonl.gz"
        if cuts_out_path.is_file():
            logging.info(f"{part} already exists - skipping.")
            continue

        # parse alignments
        alignments = {}
        part_ali_dir = alignments_dir / "LibriSpeech" / part
        for ali_path in part_ali_dir.rglob("*.alignment.txt"):
            ali = parse_alignments(ali_path)
            alignments.update(ali)
        logging.info(
            f"{part} has {len(alignments.keys())} cuts with alignments."
        )

        # add alignment attribute and write out
        cuts_in = load_manifest_lazy(cuts_in_path)
        with CutSet.open_writer(cuts_out_path) as writer:
            for cut in cuts_in:
                for idx, subcut in enumerate(cut.supervisions):
                    origin_id = subcut.id.split("_")[0]
                    if origin_id in alignments:
                        ali = alignments[origin_id]
                    else:
                        logging.info(
f"Warning: {origin_id} does not has alignment." | ||
                        )
                        ali = []
                    subcut.alignment = {"word": ali}
                writer.write(cut, flush=True)


def main():
    formatter = (
        "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    )
    logging.basicConfig(format=formatter, level=logging.INFO)

    parser = get_parser()
    args = parser.parse_args()
    logging.info(vars(args))

    download_alignments(args.alignments_dir)
    add_alignment(args.alignments_dir, args.cuts_in_dir, args.cuts_out_dir)


if __name__ == "__main__":
    main()
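After the script above has been run, each supervision in the cuts under data/fbank_ali carries a "word" alignment. Here is a minimal sketch, not part of this commit, of how the output could be inspected; it assumes each alignment entry is a lhotse AlignmentItem exposing symbol, start, and duration, as produced by parse_alignments, and the manifest path should be adjusted to the part you processed.

#!/usr/bin/env python3
# Hedged usage sketch (not part of this commit): peek at the word-level
# alignments written by ./local/add_alignment_librispeech.py.

from lhotse import load_manifest_lazy

# Default --cuts-out-dir plus one dataset part; change to match your setup.
cuts = load_manifest_lazy("data/fbank_ali/librispeech_cuts_test-clean.jsonl.gz")

for cut in cuts:
    for sup in cut.supervisions:
        # Each item is expected to carry the word label and its start time and
        # duration in seconds.
        for item in sup.alignment["word"]:
            print(item.symbol, item.start, item.duration)
        break  # only the first supervision
    break  # only the first cut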