diff --git a/egs/librispeech/ASR/RESULTS.md b/egs/librispeech/ASR/RESULTS.md
index cc9cb34baf..892a58fb21 100644
--- a/egs/librispeech/ASR/RESULTS.md
+++ b/egs/librispeech/ASR/RESULTS.md
@@ -556,9 +556,9 @@ Number of model parameters 118129516 (i.e, 118.13 M).
 
 | | test-clean | test-other | comment |
 |-------------------------------------|------------|------------|----------------------------------------|
-| greedy search (max sym per frame 1) | 2.39 | 5.57 | --epoch 39 --avg 7 --max-duration 600 |
-| modified beam search | 2.35 | 5.50 | --epoch 39 --avg 7 --max-duration 600 |
-| fast beam search | 2.38 | 5.50 | --epoch 39 --avg 7 --max-duration 600 |
+| greedy search (max sym per frame 1) | 2.43 | 5.72 | --epoch 30 --avg 10 --max-duration 600 |
+| modified beam search | 2.43 | 5.69 | --epoch 30 --avg 10 --max-duration 600 |
+| fast beam search | 2.43 | 5.67 | --epoch 30 --avg 10 --max-duration 600 |
 
 The training commands are:
 
@@ -567,8 +567,8 @@ export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
 
 ./pruned_transducer_stateless5/train.py \
   --world-size 8 \
-  --num-epochs 40 \
-  --start-epoch 0 \
+  --num-epochs 30 \
+  --start-epoch 1 \
   --full-libri 1 \
   --exp-dir pruned_transducer_stateless5/exp-L \
   --max-duration 300 \
@@ -582,15 +582,15 @@ export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
 ```
 
 The tensorboard log can be found at
-
+
 
 The decoding commands are:
 
 ```bash
 for method in greedy_search modified_beam_search fast_beam_search; do
   ./pruned_transducer_stateless5/decode.py \
-    --epoch 39 \
-    --avg 7 \
+    --epoch 30 \
+    --avg 10 \
     --exp-dir ./pruned_transducer_stateless5/exp-L \
     --max-duration 600 \
     --decoding-method $method \
@@ -600,13 +600,14 @@ for method in greedy_search modified_beam_search fast_beam_search; do
     --nhead 8 \
     --encoder-dim 512 \
     --decoder-dim 512 \
-    --joiner-dim 512
+    --joiner-dim 512 \
+    --use-averaged-model True
 done
 ```
 
 You can find a pretrained model, training logs, decoding logs, and decoding results at:
-
+
 
 
 #### Medium
@@ -615,9 +616,9 @@ Number of model parameters 30896748 (i.e, 30.9 M).
 
 | | test-clean | test-other | comment |
 |-------------------------------------|------------|------------|-----------------------------------------|
-| greedy search (max sym per frame 1) | 2.88 | 6.69 | --epoch 39 --avg 17 --max-duration 600 |
-| modified beam search | 2.83 | 6.59 | --epoch 39 --avg 17 --max-duration 600 |
-| fast beam search | 2.83 | 6.61 | --epoch 39 --avg 17 --max-duration 600 |
+| greedy search (max sym per frame 1) | 2.87 | 6.92 | --epoch 30 --avg 10 --max-duration 600 |
+| modified beam search | 2.83 | 6.75 | --epoch 30 --avg 10 --max-duration 600 |
+| fast beam search | 2.81 | 6.76 | --epoch 30 --avg 10 --max-duration 600 |
 
 The training commands are:
 
@@ -626,8 +627,8 @@ export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
 
 ./pruned_transducer_stateless5/train.py \
   --world-size 8 \
-  --num-epochs 40 \
-  --start-epoch 0 \
+  --num-epochs 30 \
+  --start-epoch 1 \
   --full-libri 1 \
   --exp-dir pruned_transducer_stateless5/exp-M \
   --max-duration 300 \
@@ -641,15 +642,15 @@ export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
 ```
 
 The tensorboard log can be found at
-
+
 
 The decoding commands are:
 
 ```bash
 for method in greedy_search modified_beam_search fast_beam_search; do
   ./pruned_transducer_stateless5/decode.py \
-    --epoch 39 \
-    --avg 17 \
+    --epoch 30 \
+    --avg 10 \
     --exp-dir ./pruned_transducer_stateless5/exp-M \
     --max-duration 600 \
     --decoding-method $method \
@@ -659,13 +660,14 @@ for method in greedy_search modified_beam_search fast_beam_search; do
     --nhead 4 \
     --encoder-dim 256 \
     --decoder-dim 512 \
-    --joiner-dim 512
+    --joiner-dim 512 \
+    --use-averaged-model True
 done
 ```
 
 You can find a pretrained model, training logs, decoding logs, and decoding results at:
-
+
 
 
 #### Baseline-2
@@ -675,19 +677,19 @@ layers (24 v.s 12) but a narrower model (1536 feedforward dim and 384 encoder di
 
 | | test-clean | test-other | comment |
 |-------------------------------------|------------|------------|-----------------------------------------|
-| greedy search (max sym per frame 1) | 2.41 | 5.70 | --epoch 31 --avg 17 --max-duration 600 |
-| modified beam search | 2.41 | 5.69 | --epoch 31 --avg 17 --max-duration 600 |
-| fast beam search | 2.41 | 5.69 | --epoch 31 --avg 17 --max-duration 600 |
+| greedy search (max sym per frame 1) | 2.54 | 5.72 | --epoch 30 --avg 10 --max-duration 600 |
+| modified beam search | 2.47 | 5.71 | --epoch 30 --avg 10 --max-duration 600 |
+| fast beam search | 2.5 | 5.72 | --epoch 30 --avg 10 --max-duration 600 |
 
 ```bash
 export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
 
 ./pruned_transducer_stateless5/train.py \
   --world-size 8 \
-  --num-epochs 40 \
-  --start-epoch 0 \
+  --num-epochs 30 \
+  --start-epoch 1 \
   --full-libri 1 \
-  --exp-dir pruned_transducer_stateless5/exp \
+  --exp-dir pruned_transducer_stateless5/exp-B \
   --max-duration 300 \
   --use-fp16 0 \
   --num-encoder-layers 24 \
@@ -699,19 +701,16 @@ export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
 ```
 
 The tensorboard log can be found at
-
-
-**Caution**: The training script is updated so that epochs are counted from 1
-after the training.
+
 
 The decoding commands are:
 
 ```bash
 for method in greedy_search modified_beam_search fast_beam_search; do
   ./pruned_transducer_stateless5/decode.py \
-    --epoch 31 \
-    --avg 17 \
-    --exp-dir ./pruned_transducer_stateless5/exp-M \
+    --epoch 30 \
+    --avg 10 \
+    --exp-dir ./pruned_transducer_stateless5/exp-B \
    --max-duration 600 \
     --decoding-method $method \
     --max-sym-per-frame 1 \
@@ -720,13 +719,14 @@ for method in greedy_search modified_beam_search fast_beam_search; do
     --nhead 8 \
     --encoder-dim 384 \
     --decoder-dim 512 \
-    --joiner-dim 512
+    --joiner-dim 512 \
+    --use-averaged-model True
 done
 ```
 
 You can find a pretrained model, training logs, decoding logs, and decoding results at:
-
+
 
 
 ### LibriSpeech BPE training results (Pruned Stateless Transducer 4)
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless5/conformer.py b/egs/librispeech/ASR/pruned_transducer_stateless5/conformer.py
index 49bc6a489a..cb56bdffc0 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless5/conformer.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless5/conformer.py
@@ -1064,10 +1064,6 @@ class RandomCombine(nn.Module):
     is a random combination of all the inputs; but which in test time
     will be just the last input.
 
-    All but the last input will have a linear transform before we
-    randomly combine them; these linear transforms will be initialized
-    to the identity transform.
-
     The idea is that the list of Tensors will be a list of outputs of multiple
     conformer layers. This has a similar effect as iterated loss. (See:
     DEJA-VU: DOUBLE FEATURE PRESENTATION AND ITERATED LOSS IN DEEP TRANSFORMER
@@ -1267,7 +1263,6 @@ def _test_random_combine(final_weight: float, pure_prob: float, stddev: float):
     num_channels = 50
     m = RandomCombine(
         num_inputs=num_inputs,
-        num_channels=num_channels,
         final_weight=final_weight,
         pure_prob=pure_prob,
         stddev=stddev,
@@ -1289,9 +1284,7 @@ def _test_random_combine_main():
     _test_random_combine(0.5, 0.5, 0.3)
 
     feature_dim = 50
-    c = Conformer(
-        num_features=feature_dim, output_dim=256, d_model=128, nhead=4
-    )
+    c = Conformer(num_features=feature_dim, d_model=128, nhead=4)
     batch_size = 5
     seq_len = 20
     # Just make sure the forward pass runs.
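For readers skimming the `RandomCombine` changes above: the deleted docstring paragraph and the dropped `num_channels` argument in `_test_random_combine` suggest that the layer outputs are now mixed directly, without per-input linear transforms. The snippet below is a minimal, hypothetical sketch of that kind of layer-combining behaviour (a random convex combination of layer outputs in training mode, just the last output in eval mode); `SimpleRandomCombine`, its weighting scheme, and the tensor shapes are illustrative assumptions, not the actual implementation in `pruned_transducer_stateless5/conformer.py`.

```python
# Illustrative sketch only -- NOT the icefall implementation.  It mimics the
# behaviour described in the RandomCombine docstring: a random combination of
# the inputs during training, and simply the last input at test time, with no
# per-input linear transforms.
import torch
import torch.nn as nn


class SimpleRandomCombine(nn.Module):
    def __init__(self, num_inputs: int, final_weight: float = 0.5):
        super().__init__()
        self.num_inputs = num_inputs
        self.final_weight = final_weight

    def forward(self, inputs: list) -> torch.Tensor:
        # inputs: list of num_inputs tensors, each (seq_len, batch, channels),
        # e.g. the outputs of successive conformer layers.
        assert len(inputs) == self.num_inputs
        if not self.training:
            return inputs[-1]  # test time: just the last layer's output
        stacked = torch.stack(inputs, dim=-1)  # (seq_len, batch, channels, num_inputs)
        # Draw one set of random non-negative weights per batch element and
        # add extra mass to the last input so it tends to dominate.
        weights = torch.rand(stacked.size(1), self.num_inputs, device=stacked.device)
        weights[:, -1] += self.final_weight * self.num_inputs
        weights = weights / weights.sum(dim=1, keepdim=True)  # rows sum to 1
        weights = weights.unsqueeze(0).unsqueeze(2)  # (1, batch, 1, num_inputs)
        return (stacked * weights).sum(dim=-1)  # (seq_len, batch, channels)


if __name__ == "__main__":
    m = SimpleRandomCombine(num_inputs=4)
    xs = [torch.randn(20, 5, 64) for _ in range(4)]
    m.train()
    assert m(xs).shape == (20, 5, 64)
    m.eval()
    assert torch.equal(m(xs), xs[-1])
```

The constructor in the diff also takes `pure_prob` and `stddev` (see the call in `_test_random_combine`); those knobs are omitted from this sketch for brevity.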