diff --git a/.github/workflows/export-nemo-fast-conformer-hybrid-transducer-ctc.yaml b/.github/workflows/export-nemo-fast-conformer-hybrid-transducer-ctc.yaml
new file mode 100644
index 000000000..3d02994f5
--- /dev/null
+++ b/.github/workflows/export-nemo-fast-conformer-hybrid-transducer-ctc.yaml
@@ -0,0 +1,73 @@
+name: export-nemo-fast-conformer-hybrid-transducer-ctc-to-onnx
+
+on:
+  workflow_dispatch:
+
+concurrency:
+  group: export-nemo-fast-conformer-hybrid-transducer-ctc-to-onnx-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  export-nemo-fast-conformer-hybrid-transducer-ctc-to-onnx:
+    if: github.repository_owner == 'k2-fsa' || github.repository_owner == 'csukuangfj'
+    name: export NeMo fast conformer
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [macos-latest]
+        python-version: ["3.10"]
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Setup Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install NeMo
+        shell: bash
+        run: |
+          BRANCH='main'
+          pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]
+          pip install onnxruntime
+          pip install kaldi-native-fbank
+          pip install soundfile librosa
+
+      - name: Run
+        shell: bash
+        run: |
+          cd scripts/nemo/fast-conformer-hybrid-transducer-ctc
+          ./run-ctc.sh
+
+          mv -v sherpa-onnx-nemo* ../../..
+
+      - name: Download test waves
+        shell: bash
+        run: |
+          mkdir test_wavs
+          pushd test_wavs
+          curl -SL -O https://hf-mirror.com/csukuangfj/sherpa-onnx-nemo-ctc-en-conformer-small/resolve/main/test_wavs/0.wav
+          curl -SL -O https://hf-mirror.com/csukuangfj/sherpa-onnx-nemo-ctc-en-conformer-small/resolve/main/test_wavs/1.wav
+          curl -SL -O https://hf-mirror.com/csukuangfj/sherpa-onnx-nemo-ctc-en-conformer-small/resolve/main/test_wavs/8k.wav
+          curl -SL -O https://hf-mirror.com/csukuangfj/sherpa-onnx-nemo-ctc-en-conformer-small/resolve/main/test_wavs/trans.txt
+          popd
+
+          cp -av test_wavs ./sherpa-onnx-nemo-streaming-fast-conformer-ctc-80ms
+          cp -av test_wavs ./sherpa-onnx-nemo-streaming-fast-conformer-ctc-480ms
+          cp -av test_wavs ./sherpa-onnx-nemo-streaming-fast-conformer-ctc-1040ms
+
+          tar cjvf sherpa-onnx-nemo-streaming-fast-conformer-ctc-80ms.tar.bz2 sherpa-onnx-nemo-streaming-fast-conformer-ctc-80ms
+          tar cjvf sherpa-onnx-nemo-streaming-fast-conformer-ctc-480ms.tar.bz2 sherpa-onnx-nemo-streaming-fast-conformer-ctc-480ms
+          tar cjvf sherpa-onnx-nemo-streaming-fast-conformer-ctc-1040ms.tar.bz2 sherpa-onnx-nemo-streaming-fast-conformer-ctc-1040ms
+
+      - name: Release
+        uses: svenstaro/upload-release-action@v2
+        with:
+          file_glob: true
+          file: ./*.tar.bz2
+          overwrite: true
+          repo_name: k2-fsa/sherpa-onnx
+          repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
+          tag: asr-models
diff --git a/scripts/nemo/fast-conformer-hybrid-transducer-ctc/README.md b/scripts/nemo/fast-conformer-hybrid-transducer-ctc/README.md
new file mode 100644
index 000000000..bbb0f861f
--- /dev/null
+++ b/scripts/nemo/fast-conformer-hybrid-transducer-ctc/README.md
@@ -0,0 +1,26 @@
+# Introduction
+
+This folder contains scripts for exporting models from
+
+ - https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_fastconformer_hybrid_large_streaming_80ms
+ - https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_fastconformer_hybrid_large_streaming_480ms
+ - https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_fastconformer_hybrid_large_streaming_1040ms
+
+to `sherpa-onnx`.
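+
+## Usage
+
+Below is a minimal sketch of running the export locally. It assumes the Python
+dependencies installed by the CI workflow (`nemo_toolkit[asr]`, `onnxruntime`,
+`kaldi-native-fbank`, `soundfile`, and `librosa`) are available.
+
+```bash
+cd scripts/nemo/fast-conformer-hybrid-transducer-ctc
+
+# Export the CTC branch of all three models (80/480/1040 ms) and
+# run a quick decoding test on the exported ONNX models
+./run-ctc.sh
+
+# Or export a single model, e.g. the 80 ms variant
+./export-onnx-ctc.py --model 80
+```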
diff --git a/scripts/nemo/fast-conformer-hybrid-transducer-ctc/export-onnx-ctc.py b/scripts/nemo/fast-conformer-hybrid-transducer-ctc/export-onnx-ctc.py
new file mode 100755
index 000000000..5751c90b5
--- /dev/null
+++ b/scripts/nemo/fast-conformer-hybrid-transducer-ctc/export-onnx-ctc.py
@@ -0,0 +1,117 @@
+#!/usr/bin/env python3
+import argparse
+from typing import Dict
+
+import nemo.collections.asr as nemo_asr
+import onnx
+import torch
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--model",
+        type=str,
+        required=True,
+        choices=["80", "480", "1040"],
+    )
+    return parser.parse_args()
+
+
+def add_meta_data(filename: str, meta_data: Dict[str, str]):
+    """Add meta data to an ONNX model. It is changed in-place.
+
+    Args:
+      filename:
+        Filename of the ONNX model to be changed.
+      meta_data:
+        Key-value pairs.
+    """
+    model = onnx.load(filename)
+    while len(model.metadata_props):
+        model.metadata_props.pop()
+
+    for key, value in meta_data.items():
+        meta = model.metadata_props.add()
+        meta.key = key
+        meta.value = str(value)
+
+    onnx.save(model, filename)
+
+
+@torch.no_grad()
+def main():
+    args = get_args()
+    model_name = f"stt_en_fastconformer_hybrid_large_streaming_{args.model}ms"
+
+    asr_model = nemo_asr.models.ASRModel.from_pretrained(model_name=model_name)
+
+    with open("./tokens.txt", "w", encoding="utf-8") as f:
+        for i, s in enumerate(asr_model.joint.vocabulary):
+            f.write(f"{s} {i}\n")
+        f.write(f"<blk> {i+1}\n")
+    print("Saved to tokens.txt")
+
+    decoder_type = "ctc"
+    asr_model.change_decoding_strategy(decoder_type=decoder_type)
+    asr_model.eval()
+
+    assert asr_model.encoder.streaming_cfg is not None
+    if isinstance(asr_model.encoder.streaming_cfg.chunk_size, list):
+        chunk_size = asr_model.encoder.streaming_cfg.chunk_size[1]
+    else:
+        chunk_size = asr_model.encoder.streaming_cfg.chunk_size
+
+    if isinstance(asr_model.encoder.streaming_cfg.pre_encode_cache_size, list):
+        pre_encode_cache_size = asr_model.encoder.streaming_cfg.pre_encode_cache_size[1]
+    else:
+        pre_encode_cache_size = asr_model.encoder.streaming_cfg.pre_encode_cache_size
+    window_size = chunk_size + pre_encode_cache_size
+
+    print("chunk_size", chunk_size)
+    print("pre_encode_cache_size", pre_encode_cache_size)
+    print("window_size", window_size)
+
+    chunk_shift = chunk_size
+
+    # cache_last_channel: (batch_size, dim1, dim2, dim3)
+    cache_last_channel_dim1 = len(asr_model.encoder.layers)
+    cache_last_channel_dim2 = asr_model.encoder.streaming_cfg.last_channel_cache_size
+    cache_last_channel_dim3 = asr_model.encoder.d_model
+
+    # cache_last_time: (batch_size, dim1, dim2, dim3)
+    cache_last_time_dim1 = len(asr_model.encoder.layers)
+    cache_last_time_dim2 = asr_model.encoder.d_model
+    cache_last_time_dim3 = asr_model.encoder.conv_context_size[0]
+
+    asr_model.set_export_config({"decoder_type": "ctc", "cache_support": True})
+
+    filename = "model.onnx"
+
+    asr_model.export(filename)
+
+    meta_data = {
+        "vocab_size": asr_model.decoder.vocab_size,
+        "window_size": window_size,
+        "chunk_shift": chunk_shift,
+        "normalize_type": "None",
+        "cache_last_channel_dim1": cache_last_channel_dim1,
+        "cache_last_channel_dim2": cache_last_channel_dim2,
+        "cache_last_channel_dim3": cache_last_channel_dim3,
+        "cache_last_time_dim1": cache_last_time_dim1,
+        "cache_last_time_dim2": cache_last_time_dim2,
+        "cache_last_time_dim3": cache_last_time_dim3,
+        "subsampling_factor": 8,
+        "model_type": "EncDecHybridRNNTCTCBPEModel",
+        "version": "1",
+        "model_author": "NeMo",
+        "url": f"https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/{model_name}",
+        "comment": "Only the CTC branch is exported",
+    }
+    add_meta_data(filename, meta_data)
+
+    print(meta_data)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/nemo/fast-conformer-hybrid-transducer-ctc/run-ctc.sh b/scripts/nemo/fast-conformer-hybrid-transducer-ctc/run-ctc.sh
new file mode 100755
index 000000000..6fae9b949
--- /dev/null
+++ b/scripts/nemo/fast-conformer-hybrid-transducer-ctc/run-ctc.sh
@@ -0,0 +1,35 @@
+#!/usr/bin/env bash
+
+set -ex
+
+if [ ! -e ./0.wav ]; then
+  # curl -SL -O https://hf-mirror.com/csukuangfj/icefall-asr-librispeech-streaming-zipformer-small-2024-03-18/resolve/main/test_wavs/0.wav
+  curl -SL -O https://huggingface.co/csukuangfj/icefall-asr-librispeech-streaming-zipformer-small-2024-03-18/resolve/main/test_wavs/0.wav
+fi
+
+ms=(
+80
+480
+1040
+)
+
+for m in ${ms[@]}; do
+  ./export-onnx-ctc.py --model $m
+  d=sherpa-onnx-nemo-streaming-fast-conformer-ctc-${m}ms
+  if [ ! -f $d/model.onnx ]; then
+    mkdir -p $d
+    mv -v model.onnx $d/
+    mv -v tokens.txt $d/
+    ls -lh $d
+  fi
+done
+
+# Now test the exported models
+
+for m in ${ms[@]}; do
+  d=sherpa-onnx-nemo-streaming-fast-conformer-ctc-${m}ms
+  python3 ./test-onnx-ctc.py \
+    --model $d/model.onnx \
+    --tokens $d/tokens.txt \
+    --wav ./0.wav
+done
diff --git a/scripts/nemo/fast-conformer-hybrid-transducer-ctc/test-onnx-ctc.py b/scripts/nemo/fast-conformer-hybrid-transducer-ctc/test-onnx-ctc.py
new file mode 100755
index 000000000..77c7a526b
--- /dev/null
+++ b/scripts/nemo/fast-conformer-hybrid-transducer-ctc/test-onnx-ctc.py
@@ -0,0 +1,197 @@
+#!/usr/bin/env python3
+
+import argparse
+from pathlib import Path
+
+import kaldi_native_fbank as knf
+import numpy as np
+import onnxruntime as ort
+import torch
+import soundfile as sf
+import librosa
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model", type=str, required=True, help="Path to model.onnx")
+
+    parser.add_argument("--tokens", type=str, required=True, help="Path to tokens.txt")
+
+    parser.add_argument("--wav", type=str, required=True, help="Path to test.wav")
+
+    return parser.parse_args()
+
+
+def create_fbank():
+    opts = knf.FbankOptions()
+    opts.frame_opts.dither = 0
+    opts.frame_opts.remove_dc_offset = False
+    opts.frame_opts.window_type = "hann"
+
+    opts.mel_opts.low_freq = 0
+    opts.mel_opts.num_bins = 80
+
+    opts.mel_opts.is_librosa = True
+
+    fbank = knf.OnlineFbank(opts)
+    return fbank
+
+
+def compute_features(audio, fbank):
+    assert len(audio.shape) == 1, audio.shape
+    fbank.accept_waveform(16000, audio)
+    ans = []
+    processed = 0
+    while processed < fbank.num_frames_ready:
+        ans.append(np.array(fbank.get_frame(processed)))
+        processed += 1
+    ans = np.stack(ans)
+    return ans
+
+
+class OnnxModel:
+    def __init__(
+        self,
+        filename: str,
+    ):
+        session_opts = ort.SessionOptions()
+        session_opts.inter_op_num_threads = 1
+        session_opts.intra_op_num_threads = 1
+
+        self.session_opts = session_opts
+
+        self.model = ort.InferenceSession(
+            filename,
+            sess_options=self.session_opts,
+            providers=["CPUExecutionProvider"],
+        )
+
+        meta = self.model.get_modelmeta().custom_metadata_map
+        print(meta)
+
+        self.window_size = int(meta["window_size"])
+        self.chunk_shift = int(meta["chunk_shift"])
+
+        self.cache_last_channel_dim1 = int(meta["cache_last_channel_dim1"])
+        self.cache_last_channel_dim2 = int(meta["cache_last_channel_dim2"])
+        self.cache_last_channel_dim3 = int(meta["cache_last_channel_dim3"])
+
+        self.cache_last_time_dim1 = int(meta["cache_last_time_dim1"])
int(meta["cache_last_time_dim1"]) + self.cache_last_time_dim2 = int(meta["cache_last_time_dim2"]) + self.cache_last_time_dim3 = int(meta["cache_last_time_dim3"]) + + self.init_cache_state() + + def init_cache_state(self): + self.cache_last_channel = torch.zeros( + 1, + self.cache_last_channel_dim1, + self.cache_last_channel_dim2, + self.cache_last_channel_dim3, + dtype=torch.float32, + ).numpy() + + self.cache_last_time = torch.zeros( + 1, + self.cache_last_time_dim1, + self.cache_last_time_dim2, + self.cache_last_time_dim3, + dtype=torch.float32, + ).numpy() + + self.cache_last_channel_len = torch.ones([1], dtype=torch.int64).numpy() + + def __call__(self, x: np.ndarray): + # x: (T, C) + x = torch.from_numpy(x) + x = x.t().unsqueeze(0) + # x: [1, C, T] + x_lens = torch.tensor([x.shape[-1]], dtype=torch.int64) + + ( + log_probs, + log_probs_len, + cache_last_channel_next, + cache_last_time_next, + cache_last_channel_len_next, + ) = self.model.run( + [ + self.model.get_outputs()[0].name, + self.model.get_outputs()[1].name, + self.model.get_outputs()[2].name, + self.model.get_outputs()[3].name, + self.model.get_outputs()[4].name, + ], + { + self.model.get_inputs()[0].name: x.numpy(), + self.model.get_inputs()[1].name: x_lens.numpy(), + self.model.get_inputs()[2].name: self.cache_last_channel, + self.model.get_inputs()[3].name: self.cache_last_time, + self.model.get_inputs()[4].name: self.cache_last_channel_len, + }, + ) + self.cache_last_channel = cache_last_channel_next + self.cache_last_time = cache_last_time_next + self.cache_last_channel_len = cache_last_channel_len_next + + # [T, vocab_size] + return torch.from_numpy(log_probs).squeeze(0) + + +def main(): + args = get_args() + assert Path(args.model).is_file(), args.model + assert Path(args.tokens).is_file(), args.tokens + assert Path(args.wav).is_file(), args.wav + + print(vars(args)) + + model = OnnxModel(args.model) + + id2token = dict() + with open(args.tokens, encoding="utf-8") as f: + for line in f: + t, idx = line.split() + id2token[int(idx)] = t + + fbank = create_fbank() + audio, sample_rate = sf.read(args.wav, dtype="float32", always_2d=True) + audio = audio[:, 0] # only use the first channel + if sample_rate != 16000: + audio = librosa.resample( + audio, + orig_sr=sample_rate, + target_sr=16000, + ) + sample_rate = 16000 + + window_size = model.window_size + chunk_shift = model.chunk_shift + + blank = len(id2token) - 1 + prev = -1 + ans = [] + + features = compute_features(audio, fbank) + num_chunks = (features.shape[0] - window_size) // chunk_shift + 1 + for i in range(num_chunks): + start = i * chunk_shift + end = start + window_size + chunk = features[start:end, :] + + log_probs = model(chunk) + ids = torch.argmax(log_probs, dim=1).tolist() + for i in ids: + if i != blank and i != prev: + ans.append(i) + prev = i + + tokens = [id2token[i] for i in ans] + underline = "▁" + # underline = b"\xe2\x96\x81".decode() + text = "".join(tokens).replace(underline, " ").strip() + print(args.wav) + print(text) + + +main()