# test.py (forked from SeanNaren/deepspeech.pytorch)
import argparse

import numpy as np
from torch.autograd import Variable
from tqdm import tqdm

from decoder import GreedyDecoder
from data.data_loader import SpectrogramDataset, AudioDataLoader
from model import DeepSpeech
parser = argparse.ArgumentParser(description='DeepSpeech transcription')
parser.add_argument('--model-path', default='models/deepspeech_final.pth',
                    help='Path to model file created by training')
parser.add_argument('--cuda', action="store_true", help='Use cuda to test model')
parser.add_argument('--test-manifest', metavar='DIR',
                    help='path to test manifest csv', default='data/test_manifest.csv')
parser.add_argument('--batch-size', default=20, type=int, help='Batch size for testing')
parser.add_argument('--num-workers', default=4, type=int, help='Number of workers used in dataloading')
parser.add_argument('--decoder', default="greedy", choices=["greedy", "beam", "none"], type=str, help="Decoder to use")
parser.add_argument('--verbose', action="store_true", help="print out decoded output and error of each sample")
no_decoder_args = parser.add_argument_group("No Decoder Options", "Configuration options for when no decoder is "
                                                                  "specified")
no_decoder_args.add_argument('--output-path', default=None, type=str, help="Where to save raw acoustic output")
beam_args = parser.add_argument_group("Beam Decode Options", "Configuration options for the CTC Beam Search decoder")
beam_args.add_argument('--top-paths', default=1, type=int, help='Number of beams to return')
beam_args.add_argument('--beam-width', default=10, type=int, help='Beam width to use')
beam_args.add_argument('--lm-path', default=None, type=str,
                       help='Path to an (optional) kenlm language model for use with beam search (req\'d with trie)')
beam_args.add_argument('--alpha', default=0.8, type=float, help='Language model weight')
beam_args.add_argument('--beta', default=1, type=float, help='Language model word bonus (all words)')
beam_args.add_argument('--cutoff-top-n', default=40, type=int,
                       help='Cutoff number in pruning: only the top cutoff_top_n characters with the highest '
                            'probabilities in the vocabulary are used in beam search (default 40)')
beam_args.add_argument('--cutoff-prob', default=1.0, type=float,
                       help='Cutoff probability in pruning, default 1.0 (no pruning)')
beam_args.add_argument('--lm-workers', default=1, type=int, help='Number of LM processes to use')
args = parser.parse_args()
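
# Example invocations (paths below are illustrative, not shipped with the repo):
#   python test.py --model-path models/deepspeech_final.pth --test-manifest data/test_manifest.csv --cuda
#   python test.py --decoder beam --lm-path lm.binary --alpha 0.8 --beta 1 --beam-width 10
#   python test.py --decoder none --output-path out.npy   # dump raw acoustic output instead of scoring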
if __name__ == '__main__':
    model = DeepSpeech.load_model(args.model_path, cuda=args.cuda)
    model.eval()

    labels = DeepSpeech.get_labels(model)
    audio_conf = DeepSpeech.get_audio_conf(model)

    if args.decoder == "beam":
        from decoder import BeamCTCDecoder

        decoder = BeamCTCDecoder(labels, lm_path=args.lm_path, alpha=args.alpha, beta=args.beta,
                                 cutoff_top_n=args.cutoff_top_n, cutoff_prob=args.cutoff_prob,
                                 beam_width=args.beam_width, num_processes=args.lm_workers)
    elif args.decoder == "greedy":
        decoder = GreedyDecoder(labels, blank_index=labels.index('_'))
    else:
        decoder = None
    # References are always converted greedily: targets are exact label index
    # sequences, so no search is needed, whatever --decoder is set to.
    target_decoder = GreedyDecoder(labels, blank_index=labels.index('_'))
    test_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=args.test_manifest, labels=labels,
                                      normalize=True)
    test_loader = AudioDataLoader(test_dataset, batch_size=args.batch_size,
                                  num_workers=args.num_workers)
    total_cer, total_wer = 0, 0
    output_data = []
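    # Each batch is (padded inputs, flat 1-D targets, input length fractions,
    # per-utterance target lengths); the flat targets are split back out below.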
    for i, data in tqdm(enumerate(test_loader), total=len(test_loader)):
        inputs, targets, input_percentages, target_sizes = data
        inputs = Variable(inputs, volatile=True)  # inference only, no gradients (pre-0.4 PyTorch API)

        # unflatten targets
        split_targets = []
        offset = 0
        for size in target_sizes:
            split_targets.append(targets[offset:offset + size])
            offset += size

        if args.cuda:
            inputs = inputs.cuda()

        out = model(inputs)
        out = out.transpose(0, 1)  # TxNxH
        seq_length = out.size(0)
        # input_percentages gives each utterance's share of the padded input;
        # scaling by the output length yields valid output frames per utterance.
        sizes = input_percentages.mul_(int(seq_length)).int()

        if decoder is None:
            # no decoder: stash raw acoustic output (and lengths) for later use
            output_data.append((out.data.cpu().numpy(), sizes.numpy()))
            continue
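        # Decode the acoustic output to strings and score each utterance
        # against its reference transcript.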
        decoded_output, _ = decoder.decode(out.data, sizes)
        target_strings = target_decoder.convert_to_strings(split_targets)
        wer, cer = 0, 0
        for x in range(len(target_strings)):
            transcript, reference = decoded_output[x][0], target_strings[x][0]
            wer_inst = decoder.wer(transcript, reference) / float(len(reference.split()))
            cer_inst = decoder.cer(transcript, reference) / float(len(reference))
            wer += wer_inst
            cer += cer_inst
            if args.verbose:
                print("Ref:", reference.lower())
                print("Hyp:", transcript.lower())
                print("WER:", wer_inst, "CER:", cer_inst, "\n")
        total_cer += cer
        total_wer += wer
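
    # Corpus-level averages: mean per-utterance WER/CER, reported as percentages.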
    if decoder is not None:
        wer = total_wer / len(test_loader.dataset)
        cer = total_cer / len(test_loader.dataset)

        print('Test Summary \t'
              'Average WER {wer:.3f}\t'
              'Average CER {cer:.3f}\t'.format(wer=wer * 100, cer=cer * 100))
    else:
        np.save(args.output_path, output_data)  # note: requires --output-path to be set
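
# A saved output file can be reloaded for offline decoding experiments, e.g.
# (assuming --output-path out.npy; recent NumPy needs allow_pickle for object arrays):
#   batches = np.load('out.npy', allow_pickle=True)  # list of (probs, sizes) per batch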