From 2802ef7700d0196ab26bf7375e3dfb3cc20a3dfe Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Mon, 10 Oct 2022 09:20:28 +0800 Subject: [PATCH 1/5] Add utility for shallow fusion --- icefall/__init__.py | 2 + icefall/ngram_lm.py | 123 ++++++++++++++++++++++++++++++++++++++++++ test/test_ngram_lm.py | 58 ++++++++++++++++++++ 3 files changed, 183 insertions(+) create mode 100644 icefall/ngram_lm.py create mode 100755 test/test_ngram_lm.py diff --git a/icefall/__init__.py b/icefall/__init__.py index 0399c84592..618f47e6d8 100644 --- a/icefall/__init__.py +++ b/icefall/__init__.py @@ -65,3 +65,5 @@ subsequent_chunk_mask, write_error_stats, ) + +from .ngram_lm import NgramLm diff --git a/icefall/ngram_lm.py b/icefall/ngram_lm.py new file mode 100644 index 0000000000..26ecb49656 --- /dev/null +++ b/icefall/ngram_lm.py @@ -0,0 +1,123 @@ +# Copyright 2022 Xiaomi Corp. (authors: Fangjun Kuang) +# +# See ../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Tuple + +import kaldifst + + +class NgramLm: + def __init__( + self, + binary_fst_filename: str, + backoff_id: int, + ): + """ + Args: + binary_fst_filename: + Path to the binary FST. + backoff_id: + ID of the backoff symbol. + """ + lm = kaldifst.StdVectorFst.read(binary_fst_filename) + if not lm.is_ilabel_sorted: + kaldifst.arcsort(lm, sort_type="ilabel") + + self.lm = lm + self.backoff_id = backoff_id + + def _process_backoff_arcs( + self, + state: int, + cost: float, + ) -> List[Tuple[int, float]]: + """Similar to ProcessNonemitting() from Kaldi, this function + returns the list of states reachable from the given state via + backoff arcs. + + Args: + state: + The input state. + cost: + The cost of reaching the given state from the start state. + Returns: + Return a list, where each element contains a tuple with two entries: + - next_state + - cost of next_state + If there is no backoff arc leaving the input state, then return + an empty list. + """ + ans = [] + + next_state, next_cost = self._get_next_state_and_cost_without_backoff( + state=state, + label=self.backoff_id, + ) + if next_state is None: + return ans + ans.append((next_state, next_cost + cost)) + ans += self._process_backoff_arcs(next_state, next_cost + cost) + return ans + + def _get_next_state_and_cost_without_backoff( + self, state: int, label: int + ) -> Tuple[int, float]: + """TODO: Add doc.""" + arc_iter = kaldifst.ArcIterator(self.lm, state) + num_arcs = self.lm.num_arcs(state) + + # The LM is arc sorted by ilabel, so we use binary search below. 
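+        # (kaldifst's ArcIterator supports O(1) random access via seek(),
+        # so each lookup below costs O(log num_arcs) instead of a linear
+        # scan over the arcs leaving this state.)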
+        left = 0
+        right = num_arcs - 1
+        while left <= right:
+            mid = (left + right) // 2
+            arc_iter.seek(mid)
+            arc = arc_iter.value
+            if arc.ilabel < label:
+                left = mid + 1
+            elif arc.ilabel > label:
+                right = mid - 1
+            else:
+                return arc.nextstate, arc.weight.value
+
+        return None, None
+
+    def get_next_state_and_cost(
+        self,
+        state: int,
+        label: int,
+    ) -> Tuple[List[int], List[float]]:
+        states = [state]
+        costs = [0]
+
+        extra_states_costs = self._process_backoff_arcs(
+            state=state,
+            cost=0,
+        )
+
+        for s, c in extra_states_costs:
+            states.append(s)
+            costs.append(c)
+
+        next_states = []
+        next_costs = []
+        for s, c in zip(states, costs):
+            ns, nc = self._get_next_state_and_cost_without_backoff(s, label)
+            if ns is not None:
+                next_states.append(ns)
+                next_costs.append(c + nc)
+
+        return next_states, next_costs
diff --git a/test/test_ngram_lm.py b/test/test_ngram_lm.py
new file mode 100755
index 0000000000..29c58cf13f
--- /dev/null
+++ b/test/test_ngram_lm.py
@@ -0,0 +1,58 @@
+#!/usr/bin/env python3
+# Copyright 2022 Xiaomi Corp. (authors: Fangjun Kuang)
+#
+# See ../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import graphviz
+import kaldifst
+
+from icefall import NgramLm
+
+
+def generate_fst(filename: str):
+    s = """
+3 5 1 1 3.00464
+3 0 3 0 5.75646
+0 1 1 1 12.0533
+0 2 2 2 7.95954
+0 9.97787
+1 4 2 2 3.35436
+1 0 3 0 7.59853
+2 0 3 0
+4 2 3 0 7.43735
+4 0.551239
+5 4 2 2 0.804938
+5 1 3 0 9.67086
+"""
+    fst = kaldifst.compile(s=s, acceptor=False)
+    fst.write(filename)
+    fst_dot = kaldifst.draw(fst, acceptor=False, portrait=True)
+    source = graphviz.Source(fst_dot)
+    source.render(outfile=f"{filename}.svg")
+
+
+def main():
+    filename = "test.fst"
+    generate_fst(filename)
+    ngram_lm = NgramLm(filename, backoff_id=3)
+    for label in [1, 2, 3, 4, 5]:
+        print("---label---", label)
+        p = ngram_lm.get_next_state_and_cost(state=5, label=label)
+        print(p)
+        print("---")
+
+
+if __name__ == "__main__":
+    main()

From b7782bbe0c4af69aae14a243b8bfcf5fdcafc13f Mon Sep 17 00:00:00 2001
From: Fangjun Kuang
Date: Mon, 17 Oct 2022 17:29:58 +0800
Subject: [PATCH 2/5] test batch size == 1 without shallow fusion

---
 .../beam_search.py | 78 +++++++++++++++++++
 1 file changed, 78 insertions(+)

diff --git a/egs/librispeech/ASR/pruned_transducer_stateless2/beam_search.py b/egs/librispeech/ASR/pruned_transducer_stateless2/beam_search.py
index 769cd2a1d8..9095e8d521 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless2/beam_search.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless2/beam_search.py
@@ -1539,3 +1539,81 @@ def fast_beam_search_with_nbest_rnn_rescoring(
         ans[key] = hyps
 
     return ans
+
+
+def modified_beam_search2(
+    model: Transducer,
+    encoder_out: torch.Tensor,
+    beam: int = 4,
+):
+    """ """
+
+    encoder_out = model.joiner.encoder_proj(encoder_out)
+
+    assert encoder_out.ndim == 2, encoder_out.shape
+    blank_id = model.decoder.blank_id
+    unk_id = getattr(model, "unk_id", blank_id)
+    context_size = model.decoder.context_size
+    device = 
next(model.parameters()).device + + B = HypothesisList() + B.add( + Hypothesis( + ys=[blank_id] * context_size, + log_prob=torch.zeros(1, dtype=torch.float32, device=device), + ) + ) + + T = encoder_out.shape[0] + for t in range(T): + current_encoder_out = encoder_out[t : t + 1] + A = list(B) + B = HypothesisList() + + ys_log_probs = torch.cat( + [hyp.log_prob.reshape(1, 1) for hyp in A] + ) # (num_hyps, 1) + + decoder_input = torch.tensor( + [hyp.ys[-context_size:] for hyp in A], + device=device, + dtype=torch.int64, + ) # (num_hyps, context_size) + decoder_out = model.decoder(decoder_input, need_pad=False).squeeze(1) + decoder_out = model.joiner.decoder_proj(decoder_out) + + # decoder_out is of shape (num_hyps, joiner_dim) + current_encoder_out = current_encoder_out.repeat(len(A), 1) + # current_encoder_out is of shape (num_hyps, encoder_out_dim) + logits = model.joiner( + current_encoder_out, + decoder_out, + project_input=False, + ) # (num_hyps, vocab_size) + log_probs = logits.log_softmax(dim=-1) # (num_hyps, vocab_size) + log_probs.add_(ys_log_probs) + + vocab_size = log_probs.size(-1) + log_probs = log_probs.reshape(-1) + topk_log_probs, topk_indexes = log_probs.topk(beam) + + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + topk_hyp_indexes = (topk_indexes // vocab_size).tolist() + topk_token_indexes = (topk_indexes % vocab_size).tolist() + + for k in range(len(topk_hyp_indexes)): + hyp_idx = topk_hyp_indexes[k] + hyp = A[hyp_idx] + new_ys = hyp.ys[:] + + new_token = topk_token_indexes[k] + if new_token not in (blank_id, unk_id): + new_ys.append(new_token) + + new_log_prob = topk_log_probs[k] + new_hyp = Hypothesis(ys=new_ys, log_prob=new_log_prob) + B.add(new_hyp) + + best_hyp = B.get_most_probable(length_norm=True) + return best_hyp.ys[context_size:] From 96bb44eb046de484d4e1599161e5865935da849c Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Tue, 18 Oct 2022 23:42:11 +0800 Subject: [PATCH 3/5] Use shallow fusion for modified-beam-search --- egs/librispeech/ASR/generate-lm.sh | 32 ++++++++++++ .../ASR/lstm_transducer_stateless2/decode.py | 33 ++++++++++++ .../beam_search.py | 28 ++++++++-- icefall/__init__.py | 2 +- icefall/ngram_lm.py | 51 +++++++++++++++++-- test/test_ngram_lm.py | 14 ++++- 6 files changed, 147 insertions(+), 13 deletions(-) create mode 100755 egs/librispeech/ASR/generate-lm.sh diff --git a/egs/librispeech/ASR/generate-lm.sh b/egs/librispeech/ASR/generate-lm.sh new file mode 100755 index 0000000000..9dc1055da8 --- /dev/null +++ b/egs/librispeech/ASR/generate-lm.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash + +lang_dir=data/lang_bpe_500 +if [ ! -f $lang_dir/bigram.arpa ]; then + ./shared/make_kn_lm.py \ + -ngram-order 2 \ + -text $lang_dir/transcript_tokens.txt \ + -lm $lang_dir/bigram.arpa +fi + +if [ ! -f $lang_dir/bigram.fst.txt ]; then + python3 -m kaldilm \ + --read-symbol-table="$lang_dir/tokens.txt" \ + --disambig-symbol='#0' \ + --max-order=2 \ + $lang_dir/bigram.arpa > $lang_dir/bigram.fst.txt +fi + +if [ ! -f $lang_dir/trigram.arpa ]; then + ./shared/make_kn_lm.py \ + -ngram-order 3 \ + -text $lang_dir/transcript_tokens.txt \ + -lm $lang_dir/trigram.arpa +fi + +if [ ! 
-f $lang_dir/trigram.fst.txt ]; then + python3 -m kaldilm \ + --read-symbol-table="$lang_dir/tokens.txt" \ + --disambig-symbol='#0' \ + --max-order=3 \ + $lang_dir/trigram.arpa > $lang_dir/trigram.fst.txt +fi diff --git a/egs/librispeech/ASR/lstm_transducer_stateless2/decode.py b/egs/librispeech/ASR/lstm_transducer_stateless2/decode.py index 420202cade..4aa9ed862a 100755 --- a/egs/librispeech/ASR/lstm_transducer_stateless2/decode.py +++ b/egs/librispeech/ASR/lstm_transducer_stateless2/decode.py @@ -115,10 +115,12 @@ greedy_search, greedy_search_batch, modified_beam_search, + modified_beam_search2, ) from librispeech import LibriSpeech from train import add_model_arguments, get_params, get_transducer_model +from icefall import NgramLm from icefall.checkpoint import ( average_checkpoints, average_checkpoints_with_averaged_model, @@ -315,6 +317,8 @@ def decode_one_batch( batch: dict, word_table: Optional[k2.SymbolTable] = None, decoding_graph: Optional[k2.Fsa] = None, + ngram_lm: Optional[NgramLm] = None, + ngram_lm_scale: float = 1.0, ) -> Dict[str, List[List[str]]]: """Decode one batch and return the result in a dict. The dict has the following format: @@ -448,6 +452,17 @@ def decode_one_batch( ) for hyp in sp.decode(hyp_tokens): hyps.append(hyp.split()) + elif params.decoding_method == "modified_beam_search2": + batch_size = encoder_out.size(0) + for i in range(batch_size): + encoder_out_i = encoder_out[i, : encoder_out_lens[i]] + hyp = modified_beam_search2( + model=model, + encoder_out=encoder_out_i, + ngram_lm=ngram_lm, + ngram_lm_scale=ngram_lm_scale, + ) + hyps.append(sp.decode(hyp).split()) else: batch_size = encoder_out.size(0) @@ -497,6 +512,8 @@ def decode_dataset( sp: spm.SentencePieceProcessor, word_table: Optional[k2.SymbolTable] = None, decoding_graph: Optional[k2.Fsa] = None, + ngram_lm: Optional[NgramLm] = None, + ngram_lm_scale: float = 1.0, ) -> Dict[str, List[Tuple[str, List[str], List[str]]]]: """Decode dataset. 
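The decoding entry points above now take an optional `NgramLm`. Further down, `main()` constructs it with a hardcoded `backoff_id=500`; that id is whatever `tokens.txt` assigns to the `#0` disambiguation symbol that kaldilm rewrites backoff arcs to (500 in `data/lang_bpe_500`). A minimal sketch of looking the id up instead of hardcoding it — the helper name and path are illustrative, not part of the patch:

```python
def get_backoff_id(tokens_txt: str, symbol: str = "#0") -> int:
    """Return the id of ``symbol`` from an OpenFst symbol-table file."""
    with open(tokens_txt, encoding="utf-8") as f:
        for line in f:
            fields = line.split()
            if len(fields) == 2 and fields[0] == symbol:
                return int(fields[1])
    raise ValueError(f"{symbol} not found in {tokens_txt}")


# Usage (illustrative path):
# backoff_id = get_backoff_id("data/lang_bpe_500/tokens.txt")
```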
@@ -546,6 +563,8 @@ def decode_dataset( decoding_graph=decoding_graph, word_table=word_table, batch=batch, + ngram_lm=ngram_lm, + ngram_lm_scale=ngram_lm_scale, ) for name, hyps in hyps_dict.items(): @@ -631,6 +650,7 @@ def main(): "fast_beam_search_nbest_LG", "fast_beam_search_nbest_oracle", "modified_beam_search", + "modified_beam_search2", ) params.res_dir = params.exp_dir / params.decoding_method @@ -655,6 +675,7 @@ def main(): else: params.suffix += f"-context-{params.context_size}" params.suffix += f"-max-sym-per-frame-{params.max_sym_per_frame}" + params.suffix += f"-ngram-lm-scale-{params.ngram_lm_scale}" if params.use_averaged_model: params.suffix += "-use-averaged-model" @@ -768,6 +789,16 @@ def main(): model.to(device) model.eval() + # lm_filename = "bigram.fst.txt" + lm_filename = "trigram.fst.txt" + logging.info(f"lm filename: {lm_filename}") + ngram_lm = NgramLm( + str(params.lang_dir / lm_filename), + backoff_id=500, + is_binary=False, + ) + logging.info(f"num states: {ngram_lm.lm.num_states}") + if "fast_beam_search" in params.decoding_method: if params.decoding_method == "fast_beam_search_nbest_LG": lexicon = Lexicon(params.lang_dir) @@ -812,6 +843,8 @@ def main(): sp=sp, word_table=word_table, decoding_graph=decoding_graph, + ngram_lm=ngram_lm, + ngram_lm_scale=params.ngram_lm_scale, ) save_results( diff --git a/egs/librispeech/ASR/pruned_transducer_stateless2/beam_search.py b/egs/librispeech/ASR/pruned_transducer_stateless2/beam_search.py index 9095e8d521..36f159cf7f 100644 --- a/egs/librispeech/ASR/pruned_transducer_stateless2/beam_search.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless2/beam_search.py @@ -23,6 +23,7 @@ import torch from model import Transducer +from icefall import NgramLm, NgramLmStateCost from icefall.decode import Nbest, one_best_decoding from icefall.utils import add_eos, add_sos, get_texts @@ -656,6 +657,8 @@ class Hypothesis: # It contains only one entry. 
log_prob: torch.Tensor + state_cost: Optional[NgramLmStateCost] = None + @property def key(self) -> str: """Return a string representation of self.ys""" @@ -1544,12 +1547,14 @@ def fast_beam_search_with_nbest_rnn_rescoring( def modified_beam_search2( model: Transducer, encoder_out: torch.Tensor, + ngram_lm: NgramLm, + ngram_lm_scale: float, beam: int = 4, ): - """ """ - encoder_out = model.joiner.encoder_proj(encoder_out) + lm_scale = ngram_lm_scale + assert encoder_out.ndim == 2, encoder_out.shape blank_id = model.decoder.blank_id unk_id = getattr(model, "unk_id", blank_id) @@ -1561,6 +1566,7 @@ def modified_beam_search2( Hypothesis( ys=[blank_id] * context_size, log_prob=torch.zeros(1, dtype=torch.float32, device=device), + state_cost=NgramLmStateCost(ngram_lm), ) ) @@ -1571,7 +1577,10 @@ def modified_beam_search2( B = HypothesisList() ys_log_probs = torch.cat( - [hyp.log_prob.reshape(1, 1) for hyp in A] + [ + hyp.log_prob.reshape(1, 1) + hyp.state_cost.lm_score * lm_scale + for hyp in A + ] ) # (num_hyps, 1) decoder_input = torch.tensor( @@ -1610,9 +1619,18 @@ def modified_beam_search2( new_token = topk_token_indexes[k] if new_token not in (blank_id, unk_id): new_ys.append(new_token) + state_cost = hyp.state_cost.forward_one_step(new_token) + else: + state_cost = hyp.state_cost - new_log_prob = topk_log_probs[k] - new_hyp = Hypothesis(ys=new_ys, log_prob=new_log_prob) + # We only keep AM scores in new_hyp.log_prob + new_log_prob = ( + topk_log_probs[k] - hyp.state_cost.lm_score * lm_scale + ) + + new_hyp = Hypothesis( + ys=new_ys, log_prob=new_log_prob, state_cost=state_cost + ) B.add(new_hyp) best_hyp = B.get_most_probable(length_norm=True) diff --git a/icefall/__init__.py b/icefall/__init__.py index 618f47e6d8..122226fdc9 100644 --- a/icefall/__init__.py +++ b/icefall/__init__.py @@ -66,4 +66,4 @@ write_error_stats, ) -from .ngram_lm import NgramLm +from .ngram_lm import NgramLm, NgramLmStateCost diff --git a/icefall/ngram_lm.py b/icefall/ngram_lm.py index 26ecb49656..23185e35ab 100644 --- a/icefall/ngram_lm.py +++ b/icefall/ngram_lm.py @@ -14,7 +14,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import List, Tuple +from collections import defaultdict +from typing import List, Optional, Tuple import kaldifst @@ -22,17 +23,25 @@ class NgramLm: def __init__( self, - binary_fst_filename: str, + fst_filename: str, backoff_id: int, + is_binary: bool = False, ): """ Args: - binary_fst_filename: - Path to the binary FST. + fst_filename: + Path to the FST. backoff_id: ID of the backoff symbol. + is_binary: + True if the given file is a binary FST. 
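+            Note: the *.fst.txt files produced by generate-lm.sh are text
+            FSTs (use is_binary=False); test/test_ngram_lm.py writes a
+            binary FST and loads it with is_binary=True.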
""" - lm = kaldifst.StdVectorFst.read(binary_fst_filename) + if is_binary: + lm = kaldifst.StdVectorFst.read(fst_filename) + else: + with open(fst_filename, "r") as f: + lm = kaldifst.compile(f.read(), acceptor=False) + if not lm.is_ilabel_sorted: kaldifst.arcsort(lm, sort_type="ilabel") @@ -121,3 +130,35 @@ def get_next_state_and_cost( next_costs.append(c + nc) return next_states, next_costs + + +class NgramLmStateCost: + def __init__(self, ngram_lm: NgramLm, state_cost: Optional[dict] = None): + assert ngram_lm.lm.start == 0, ngram_lm.lm.start + self.ngram_lm = ngram_lm + if state_cost is not None: + self.state_cost = state_cost + else: + self.state_cost = defaultdict(lambda: float("inf")) + + # At the very beginning, we are at the start state with cost 0 + self.state_cost[0] = 0.0 + + def forward_one_step(self, label: int) -> "NgramLmStateCost": + state_cost = defaultdict(lambda: float("inf")) + for s, c in self.state_cost.items(): + next_states, next_costs = self.ngram_lm.get_next_state_and_cost( + s, + label, + ) + for ns, nc in zip(next_states, next_costs): + state_cost[ns] = min(state_cost[ns], c + nc) + + return NgramLmStateCost(ngram_lm=self.ngram_lm, state_cost=state_cost) + + @property + def lm_score(self) -> float: + if len(self.state_cost) == 0: + return float("-inf") + + return -1 * min(self.state_cost.values()) diff --git a/test/test_ngram_lm.py b/test/test_ngram_lm.py index 29c58cf13f..bbf6bd51c2 100755 --- a/test/test_ngram_lm.py +++ b/test/test_ngram_lm.py @@ -18,7 +18,7 @@ import graphviz import kaldifst -from icefall import NgramLm +from icefall import NgramLm, NgramLmStateCost def generate_fst(filename: str): @@ -46,13 +46,23 @@ def generate_fst(filename: str): def main(): filename = "test.fst" generate_fst(filename) - ngram_lm = NgramLm(filename, backoff_id=3) + ngram_lm = NgramLm(filename, backoff_id=3, is_binary=True) for label in [1, 2, 3, 4, 5]: print("---label---", label) p = ngram_lm.get_next_state_and_cost(state=5, label=label) print(p) print("---") + state_cost = NgramLmStateCost(ngram_lm) + s0 = state_cost.forward_one_step(1) + print(s0.state_cost) + + s1 = s0.forward_one_step(2) + print(s1.state_cost) + + s2 = s1.forward_one_step(2) + print(s2.state_cost) + if __name__ == "__main__": main() From 13411dbee0864a85b68eee20ddf0e132a7ad33d1 Mon Sep 17 00:00:00 2001 From: Erwan Date: Thu, 20 Oct 2022 21:53:41 +0200 Subject: [PATCH 4/5] Modified beam search with ngram rescoring --- egs/librispeech/ASR/generate-lm.sh | 32 +++- .../ASR/lstm_transducer_stateless2/decode.py | 30 ++-- .../beam_search.py | 149 +++++++++++++----- 3 files changed, 152 insertions(+), 59 deletions(-) diff --git a/egs/librispeech/ASR/generate-lm.sh b/egs/librispeech/ASR/generate-lm.sh index 9dc1055da8..93e3fe4e68 100755 --- a/egs/librispeech/ASR/generate-lm.sh +++ b/egs/librispeech/ASR/generate-lm.sh @@ -1,32 +1,48 @@ #!/usr/bin/env bash lang_dir=data/lang_bpe_500 -if [ ! -f $lang_dir/bigram.arpa ]; then +if [ ! -f $lang_dir/2gram.arpa ]; then ./shared/make_kn_lm.py \ -ngram-order 2 \ -text $lang_dir/transcript_tokens.txt \ - -lm $lang_dir/bigram.arpa + -lm $lang_dir/2gram.arpa fi -if [ ! -f $lang_dir/bigram.fst.txt ]; then +if [ ! -f $lang_dir/2gram.fst.txt ]; then python3 -m kaldilm \ --read-symbol-table="$lang_dir/tokens.txt" \ --disambig-symbol='#0' \ --max-order=2 \ - $lang_dir/bigram.arpa > $lang_dir/bigram.fst.txt + $lang_dir/2gram.arpa > $lang_dir/2gram.fst.txt fi -if [ ! -f $lang_dir/trigram.arpa ]; then +if [ ! 
-f $lang_dir/3gram.arpa ]; then
   ./shared/make_kn_lm.py \
     -ngram-order 3 \
     -text $lang_dir/transcript_tokens.txt \
-    -lm $lang_dir/trigram.arpa
+    -lm $lang_dir/3gram.arpa
 fi
 
-if [ ! -f $lang_dir/trigram.fst.txt ]; then
+if [ ! -f $lang_dir/3gram.fst.txt ]; then
   python3 -m kaldilm \
     --read-symbol-table="$lang_dir/tokens.txt" \
     --disambig-symbol='#0' \
     --max-order=3 \
-    $lang_dir/trigram.arpa > $lang_dir/trigram.fst.txt
+    $lang_dir/3gram.arpa > $lang_dir/3gram.fst.txt
+fi
+
+
+if [ ! -f $lang_dir/5gram.arpa ]; then
+  ./shared/make_kn_lm.py \
+    -ngram-order 5 \
+    -text $lang_dir/transcript_tokens.txt \
+    -lm $lang_dir/5gram.arpa
+fi
+
+if [ ! -f $lang_dir/5gram.fst.txt ]; then
+  python3 -m kaldilm \
+    --read-symbol-table="$lang_dir/tokens.txt" \
+    --disambig-symbol='#0' \
+    --max-order=5 \
+    $lang_dir/5gram.arpa > $lang_dir/5gram.fst.txt
 fi
diff --git a/egs/librispeech/ASR/lstm_transducer_stateless2/decode.py b/egs/librispeech/ASR/lstm_transducer_stateless2/decode.py
index 4aa9ed862a..2d8b54a7e8 100755
--- a/egs/librispeech/ASR/lstm_transducer_stateless2/decode.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless2/decode.py
@@ -115,7 +115,7 @@
     greedy_search,
     greedy_search_batch,
     modified_beam_search,
-    modified_beam_search2,
+    modified_beam_search_ngram_rescoring,
 )
 from librispeech import LibriSpeech
 from train import add_model_arguments, get_params, get_transducer_model
@@ -216,6 +216,7 @@ def get_parser():
       - fast_beam_search_nbest
       - fast_beam_search_nbest_oracle
       - fast_beam_search_nbest_LG
+      - modified_beam_search_ngram_rescoring
     If you use fast_beam_search_nbest_LG, you have to specify
     `--lang-dir`, which should contain `LG.pt`.
     """,
@@ -452,17 +453,17 @@ def decode_one_batch(
         )
         for hyp in sp.decode(hyp_tokens):
             hyps.append(hyp.split())
-    elif params.decoding_method == "modified_beam_search2":
-        batch_size = encoder_out.size(0)
-        for i in range(batch_size):
-            encoder_out_i = encoder_out[i, : encoder_out_lens[i]]
-            hyp = modified_beam_search2(
-                model=model,
-                encoder_out=encoder_out_i,
-                ngram_lm=ngram_lm,
-                ngram_lm_scale=ngram_lm_scale,
-            )
-            hyps.append(sp.decode(hyp).split())
+    elif params.decoding_method == "modified_beam_search_ngram_rescoring":
+        hyp_tokens = modified_beam_search_ngram_rescoring(
+            model=model,
+            encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
+            ngram_lm=ngram_lm,
+            ngram_lm_scale=ngram_lm_scale,
+            beam=params.beam_size,
+        )
+        for hyp in sp.decode(hyp_tokens):
+            hyps.append(hyp.split())
     else:
         batch_size = encoder_out.size(0)
 
@@ -650,7 +651,7 @@ def main():
         "fast_beam_search_nbest_LG",
         "fast_beam_search_nbest_oracle",
         "modified_beam_search",
-        "modified_beam_search2",
+        "modified_beam_search_ngram_rescoring",
     )
     params.res_dir = params.exp_dir / params.decoding_method
 
@@ -789,8 +790,7 @@ def main():
     model.to(device)
     model.eval()
 
-    # lm_filename = "bigram.fst.txt"
-    lm_filename = "trigram.fst.txt"
+    lm_filename = "5gram.fst.txt"
     logging.info(f"lm filename: {lm_filename}")
     ngram_lm = NgramLm(
         str(params.lang_dir / lm_filename),
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless2/beam_search.py b/egs/librispeech/ASR/pruned_transducer_stateless2/beam_search.py
index 36f159cf7f..c70618ef76 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless2/beam_search.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless2/beam_search.py
@@ -1544,78 +1544,147 @@ def fast_beam_search_with_nbest_rnn_rescoring(
     return ans
 
 
-def modified_beam_search2(
+def modified_beam_search_ngram_rescoring(
     model: Transducer,
     encoder_out: torch.Tensor,
+    
encoder_out_lens: torch.Tensor, ngram_lm: NgramLm, ngram_lm_scale: float, beam: int = 4, -): - encoder_out = model.joiner.encoder_proj(encoder_out) + temperature: float = 1.0, +) -> List[List[int]]: + """Beam search in batch mode with --max-sym-per-frame=1 being hardcoded. - lm_scale = ngram_lm_scale + Args: + model: + The transducer model. + encoder_out: + Output from the encoder. Its shape is (N, T, C). + encoder_out_lens: + A 1-D tensor of shape (N,), containing number of valid frames in + encoder_out before padding. + beam: + Number of active paths during the beam search. + temperature: + Softmax temperature. + Returns: + Return a list-of-list of token IDs. ans[i] is the decoding results + for the i-th utterance. + """ + assert encoder_out.ndim == 3, encoder_out.shape + assert encoder_out.size(0) >= 1, encoder_out.size(0) + + packed_encoder_out = torch.nn.utils.rnn.pack_padded_sequence( + input=encoder_out, + lengths=encoder_out_lens.cpu(), + batch_first=True, + enforce_sorted=False, + ) - assert encoder_out.ndim == 2, encoder_out.shape blank_id = model.decoder.blank_id unk_id = getattr(model, "unk_id", blank_id) context_size = model.decoder.context_size device = next(model.parameters()).device + lm_scale = ngram_lm_scale - B = HypothesisList() - B.add( - Hypothesis( - ys=[blank_id] * context_size, - log_prob=torch.zeros(1, dtype=torch.float32, device=device), - state_cost=NgramLmStateCost(ngram_lm), + batch_size_list = packed_encoder_out.batch_sizes.tolist() + N = encoder_out.size(0) + assert torch.all(encoder_out_lens > 0), encoder_out_lens + assert N == batch_size_list[0], (N, batch_size_list) + + B = [HypothesisList() for _ in range(N)] + for i in range(N): + B[i].add( + Hypothesis( + ys=[blank_id] * context_size, + log_prob=torch.zeros(1, dtype=torch.float32, device=device), + state_cost=NgramLmStateCost(ngram_lm), + ) ) - ) - T = encoder_out.shape[0] - for t in range(T): - current_encoder_out = encoder_out[t : t + 1] - A = list(B) - B = HypothesisList() + encoder_out = model.joiner.encoder_proj(packed_encoder_out.data) + + offset = 0 + finalized_B = [] + for batch_size in batch_size_list: + start = offset + end = offset + batch_size + current_encoder_out = encoder_out.data[start:end] + current_encoder_out = current_encoder_out.unsqueeze(1).unsqueeze(1) + # current_encoder_out's shape is (batch_size, 1, 1, encoder_out_dim) + offset = end + + finalized_B = B[batch_size:] + finalized_B + B = B[:batch_size] + + hyps_shape = get_hyps_shape(B).to(device) + + A = [list(b) for b in B] + B = [HypothesisList() for _ in range(batch_size)] ys_log_probs = torch.cat( [ hyp.log_prob.reshape(1, 1) + hyp.state_cost.lm_score * lm_scale - for hyp in A + for hyps in A + for hyp in hyps ] ) # (num_hyps, 1) decoder_input = torch.tensor( - [hyp.ys[-context_size:] for hyp in A], + [hyp.ys[-context_size:] for hyps in A for hyp in hyps], device=device, dtype=torch.int64, ) # (num_hyps, context_size) - decoder_out = model.decoder(decoder_input, need_pad=False).squeeze(1) + + decoder_out = model.decoder(decoder_input, need_pad=False).unsqueeze(1) decoder_out = model.joiner.decoder_proj(decoder_out) + # decoder_out is of shape (num_hyps, 1, 1, joiner_dim) + + # Note: For torch 1.7.1 and below, it requires a torch.int64 tensor + # as index, so we use `to(torch.int64)` below. 
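+        # (hyps_shape.row_ids(1) maps each hypothesis to its utterance, so
+        # this gather repeats an utterance's encoder frame once for every
+        # active hypothesis of that utterance.)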
+ current_encoder_out = torch.index_select( + current_encoder_out, + dim=0, + index=hyps_shape.row_ids(1).to(torch.int64), + ) # (num_hyps, 1, 1, encoder_out_dim) - # decoder_out is of shape (num_hyps, joiner_dim) - current_encoder_out = current_encoder_out.repeat(len(A), 1) - # current_encoder_out is of shape (num_hyps, encoder_out_dim) logits = model.joiner( current_encoder_out, decoder_out, project_input=False, - ) # (num_hyps, vocab_size) - log_probs = logits.log_softmax(dim=-1) # (num_hyps, vocab_size) - log_probs.add_(ys_log_probs) + ) # (num_hyps, 1, 1, vocab_size) + + logits = logits.squeeze(1).squeeze(1) # (num_hyps, vocab_size) + + log_probs = (logits / temperature).log_softmax( + dim=-1 + ) # (num_hyps, vocab_size) + log_probs.add_(ys_log_probs) vocab_size = log_probs.size(-1) log_probs = log_probs.reshape(-1) - topk_log_probs, topk_indexes = log_probs.topk(beam) - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - topk_hyp_indexes = (topk_indexes // vocab_size).tolist() - topk_token_indexes = (topk_indexes % vocab_size).tolist() + row_splits = hyps_shape.row_splits(1) * vocab_size + log_probs_shape = k2.ragged.create_ragged_shape2( + row_splits=row_splits, cached_tot_size=log_probs.numel() + ) + ragged_log_probs = k2.RaggedTensor( + shape=log_probs_shape, value=log_probs + ) + + for i in range(batch_size): + topk_log_probs, topk_indexes = ragged_log_probs[i].topk(beam) + + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + topk_hyp_indexes = (topk_indexes // vocab_size).tolist() + topk_token_indexes = (topk_indexes % vocab_size).tolist() for k in range(len(topk_hyp_indexes)): hyp_idx = topk_hyp_indexes[k] - hyp = A[hyp_idx] - new_ys = hyp.ys[:] + hyp = A[i][hyp_idx] + new_ys = hyp.ys[:] new_token = topk_token_indexes[k] if new_token not in (blank_id, unk_id): new_ys.append(new_token) @@ -1631,7 +1700,15 @@ def modified_beam_search2( new_hyp = Hypothesis( ys=new_ys, log_prob=new_log_prob, state_cost=state_cost ) - B.add(new_hyp) + B[i].add(new_hyp) - best_hyp = B.get_most_probable(length_norm=True) - return best_hyp.ys[context_size:] + B = B + finalized_B + best_hyps = [b.get_most_probable(length_norm=True) for b in B] + + sorted_ans = [h.ys[context_size:] for h in best_hyps] + ans = [] + unsorted_indices = packed_encoder_out.unsorted_indices.tolist() + for i in range(N): + ans.append(sorted_ans[unsorted_indices[i]]) + + return ans From f5f3cf28694b9fd7818347975a5723baf91508f5 Mon Sep 17 00:00:00 2001 From: Erwan Date: Fri, 21 Oct 2022 09:45:41 +0200 Subject: [PATCH 5/5] Fix code according to review --- egs/librispeech/ASR/generate-lm.sh | 58 +++++-------------- .../ASR/lstm_transducer_stateless2/decode.py | 20 ++++++- 2 files changed, 33 insertions(+), 45 deletions(-) diff --git a/egs/librispeech/ASR/generate-lm.sh b/egs/librispeech/ASR/generate-lm.sh index 93e3fe4e68..6baccd3810 100755 --- a/egs/librispeech/ASR/generate-lm.sh +++ b/egs/librispeech/ASR/generate-lm.sh @@ -1,48 +1,20 @@ #!/usr/bin/env bash lang_dir=data/lang_bpe_500 -if [ ! -f $lang_dir/2gram.arpa ]; then - ./shared/make_kn_lm.py \ - -ngram-order 2 \ - -text $lang_dir/transcript_tokens.txt \ - -lm $lang_dir/2gram.arpa -fi -if [ ! -f $lang_dir/2gram.fst.txt ]; then - python3 -m kaldilm \ - --read-symbol-table="$lang_dir/tokens.txt" \ - --disambig-symbol='#0' \ - --max-order=2 \ - $lang_dir/2gram.arpa > $lang_dir/2gram.fst.txt -fi +for ngram in 2 3 5; do + if [ ! 
-f $lang_dir/${ngram}gram.arpa ]; then
+    ./shared/make_kn_lm.py \
+      -ngram-order ${ngram} \
+      -text $lang_dir/transcript_tokens.txt \
+      -lm $lang_dir/${ngram}gram.arpa
+  fi
+
+  if [ ! -f $lang_dir/${ngram}gram.fst.txt ]; then
+    python3 -m kaldilm \
+      --read-symbol-table="$lang_dir/tokens.txt" \
+      --disambig-symbol='#0' \
+      --max-order=${ngram} \
+      $lang_dir/${ngram}gram.arpa > $lang_dir/${ngram}gram.fst.txt
+  fi
+done
diff --git a/egs/librispeech/ASR/lstm_transducer_stateless2/decode.py b/egs/librispeech/ASR/lstm_transducer_stateless2/decode.py
index 2d8b54a7e8..c7b53ebc00 100755
--- a/egs/librispeech/ASR/lstm_transducer_stateless2/decode.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless2/decode.py
@@ -306,6 +306,22 @@ def get_parser():
         fast_beam_search_nbest_LG, and fast_beam_search_nbest_oracle""",
     )
 
+    parser.add_argument(
+        "--tokens-ngram",
+        type=int,
+        default=3,
+        help="""Token Ngram used for rescoring.
+        Used only when the decoding method is modified_beam_search_ngram_rescoring""",
+    )
+
+    parser.add_argument(
+        "--backoff-id",
+        type=int,
+        default=500,
+        help="""ID of the backoff symbol.
+        Used only when the decoding method is modified_beam_search_ngram_rescoring""",
+    )
+
     add_model_arguments(parser)
 
     return parser
@@ -790,11 +806,11 @@ def main():
     model.to(device)
     model.eval()
 
-    lm_filename = "5gram.fst.txt"
+    lm_filename = f"{params.tokens_ngram}gram.fst.txt"
     logging.info(f"lm filename: {lm_filename}")
     ngram_lm = NgramLm(
         str(params.lang_dir / lm_filename),
-        backoff_id=500,
+        backoff_id=params.backoff_id,
         is_binary=False,
     )
     logging.info(f"num states: {ngram_lm.lm.num_states}")
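
For reference, the pieces above combine at decode time roughly as follows. This is a minimal sketch, assuming generate-lm.sh has been run under egs/librispeech/ASR and that `#0` maps to id 500 in data/lang_bpe_500/tokens.txt; the token ids fed to the LM are illustrative only:

```python
from icefall import NgramLm, NgramLmStateCost

# Text FST written by generate-lm.sh; is_binary=False matches decode.py.
ngram_lm = NgramLm(
    "data/lang_bpe_500/3gram.fst.txt",
    backoff_id=500,
    is_binary=False,
)

# modified_beam_search_ngram_rescoring keeps one NgramLmStateCost per
# hypothesis and advances it each time a non-blank token is emitted.
state_cost = NgramLmStateCost(ngram_lm)
prev = 0.0
for token in [23, 7, 101]:  # illustrative BPE token ids
    state_cost = state_cost.forward_one_step(token)
    # lm_score is the cumulative negated best-path cost; the per-step
    # increment (scaled by --ngram-lm-scale) is what shallow fusion adds
    # on top of the transducer's log-probabilities.
    print(token, state_cost.lm_score - prev)
    prev = state_cost.lm_score
```

The corresponding decoding invocation is `./lstm_transducer_stateless2/decode.py --decoding-method modified_beam_search_ngram_rescoring --tokens-ngram 3 --backoff-id 500` together with the usual checkpoint, data, and `--ngram-lm-scale` options.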