Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support padding removing. #444

Merged
merged 2 commits into from
Nov 10, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 39 additions & 3 deletions deep_speech_2/data_utils/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,9 @@ class DataGenerator(object):
be passed forward directly without
converting to index sequence.
:type keep_transcription_text: bool
:param num_conv_layers: The number of convolution layer, used to compute
the sequence length.
:type num_conv_layers: int
"""

def __init__(self,
Expand All @@ -74,7 +77,8 @@ def __init__(self,
use_dB_normalization=True,
num_threads=multiprocessing.cpu_count() // 2,
random_seed=0,
keep_transcription_text=False):
keep_transcription_text=False,
num_conv_layers=2):
self._max_duration = max_duration
self._min_duration = min_duration
self._normalizer = FeatureNormalizer(mean_std_filepath)
Expand All @@ -95,6 +99,7 @@ def __init__(self,
self._local_data = local()
self._local_data.tar2info = {}
self._local_data.tar2object = {}
self._num_conv_layers = num_conv_layers

def process_utterance(self, filename, transcript):
"""Load, augment, featurize and normalize for speech data.
Expand Down Expand Up @@ -213,7 +218,15 @@ def feeding(self):
:return: Data feeding dict.
:rtype: dict
"""
return {"audio_spectrogram": 0, "transcript_text": 1}
feeding_dict = {
"audio_spectrogram": 0,
"transcript_text": 1,
"sequence_offset": 2,
"sequence_length": 3
}
for i in xrange(self._num_conv_layers):
feeding_dict["conv%d_index_range" % i] = len(feeding_dict)
return feeding_dict

@property
def vocab_size(self):
Expand Down Expand Up @@ -306,7 +319,30 @@ def _padding_batch(self, batch, padding_to=-1, flatten=False):
padded_audio[:, :audio.shape[1]] = audio
if flatten:
padded_audio = padded_audio.flatten()
new_batch.append((padded_audio, text))

# Stride size for conv0 is (3, 2)
# Stride size for conv1 to convN is (1, 2)
# Same as the network, hard-coded here
padded_instance = [padded_audio, text]
padded_conv0_h = (padded_audio.shape[0] - 1) // 2 + 1
padded_conv0_w = (padded_audio.shape[1] - 1) // 3 + 1
valid_w = (audio.shape[1] - 1) // 3 + 1
padded_instance += [
[0], # sequence offset, always 0
[valid_w], # valid sequence length
# Index ranges for channel, height and width
# Please refer scale_sub_region layer to see details
[1, 32, 1, padded_conv0_h, valid_w + 1, padded_conv0_w]
]
pre_padded_h = padded_conv0_h
for i in xrange(self._num_conv_layers - 1):
padded_h = (pre_padded_h - 1) // 2 + 1
pre_padded_h = padded_h
padded_instance += [
[1, 32, 1, padded_h, valid_w + 1, padded_conv0_w]
]

new_batch.append(padded_instance)
return new_batch

def _batch_shuffle(self, manifest, batch_size, clipped=False):
Expand Down
8 changes: 5 additions & 3 deletions deep_speech_2/infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,8 @@ def infer():
augmentation_config='{}',
specgram_type=args.specgram_type,
num_threads=1,
keep_transcription_text=True)
keep_transcription_text=True,
num_conv_layers=args.num_conv_layers)
batch_reader = data_generator.batch_reader_creator(
manifest_path=args.infer_manifest,
batch_size=args.num_samples,
Expand Down Expand Up @@ -100,10 +101,11 @@ def infer():
cutoff_top_n=args.cutoff_top_n,
vocab_list=vocab_list,
language_model_path=args.lang_model_path,
num_processes=args.num_proc_bsearch)
num_processes=args.num_proc_bsearch,
feeding_dict=data_generator.feeding)

error_rate_func = cer if args.error_rate_type == 'cer' else wer
target_transcripts = [transcript for _, transcript in infer_data]
target_transcripts = [data[1] for data in infer_data]
for target, result in zip(target_transcripts, result_transcripts):
print("\nTarget Transcription: %s\nOutput Transcription: %s" %
(target, result))
Expand Down
30 changes: 26 additions & 4 deletions deep_speech_2/model_utils/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ def infer_loss_batch(self, infer_data):

def infer_batch(self, infer_data, decoding_method, beam_alpha, beam_beta,
beam_size, cutoff_prob, cutoff_top_n, vocab_list,
language_model_path, num_processes):
language_model_path, num_processes, feeding_dict):
"""Model inference. Infer the transcription for a batch of speech
utterances.

Expand Down Expand Up @@ -195,6 +195,9 @@ def infer_batch(self, infer_data, decoding_method, beam_alpha, beam_beta,
:type language_model_path: basestring|None
:param num_processes: Number of processes (CPU) for decoder.
:type num_processes: int
:param feeding_dict: Feeding is a map of field name and tuple index
of the data that reader returns.
:type feeding_dict: dict|list
:return: List of transcription texts.
:rtype: List of basestring
"""
Expand All @@ -203,10 +206,13 @@ def infer_batch(self, infer_data, decoding_method, beam_alpha, beam_beta,
self._inferer = paddle.inference.Inference(
output_layer=self._log_probs, parameters=self._parameters)
# run inference
infer_results = self._inferer.infer(input=infer_data)
num_steps = len(infer_results) // len(infer_data)
infer_results = self._inferer.infer(
input=infer_data, feeding=feeding_dict)
start_pos = [0] * (len(infer_data) + 1)
for i in xrange(len(infer_data)):
start_pos[i + 1] = start_pos[i] + infer_data[i][3][0]
probs_split = [
infer_results[i * num_steps:(i + 1) * num_steps]
infer_results[start_pos[i]:start_pos[i + 1]]
for i in xrange(0, len(infer_data))
]
# run decoder
Expand Down Expand Up @@ -274,9 +280,25 @@ def _create_network(self, vocab_size, num_conv_layers, num_rnn_layers,
text_data = paddle.layer.data(
name="transcript_text",
type=paddle.data_type.integer_value_sequence(vocab_size))
seq_offset_data = paddle.layer.data(
name='sequence_offset',
type=paddle.data_type.integer_value_sequence(1))
seq_len_data = paddle.layer.data(
name='sequence_length',
type=paddle.data_type.integer_value_sequence(1))
index_range_datas = []
for i in xrange(num_rnn_layers):
index_range_datas.append(
paddle.layer.data(
name='conv%d_index_range' % i,
type=paddle.data_type.dense_vector(6)))

self._log_probs, self._loss = deep_speech_v2_network(
audio_data=audio_data,
text_data=text_data,
seq_offset_data=seq_offset_data,
seq_len_data=seq_len_data,
index_range_datas=index_range_datas,
dict_size=vocab_size,
num_conv_layers=num_conv_layers,
num_rnn_layers=num_rnn_layers,
Expand Down
42 changes: 35 additions & 7 deletions deep_speech_2/model_utils/network.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@


def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride,
padding, act):
padding, act, index_range_data):
"""Convolution layer with batch normalization.

:param input: Input layer.
Expand All @@ -24,6 +24,8 @@ def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride,
:type padding: int|tuple|list
:param act: Activation type.
:type act: BaseActivation
:param index_range_data: Index range to indicate sub region.
:type index_range_data: LayerOutput
:return: Batch norm layer after convolution layer.
:rtype: LayerOutput
"""
Expand All @@ -36,7 +38,11 @@ def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride,
padding=padding,
act=paddle.activation.Linear(),
bias_attr=False)
return paddle.layer.batch_norm(input=conv_layer, act=act)
batch_norm = paddle.layer.batch_norm(input=conv_layer, act=act)
# reset padding part to 0
scale_sub_region = paddle.layer.scale_sub_region(
batch_norm, index_range_data, value=0.0)
return scale_sub_region


def bidirectional_simple_rnn_bn_layer(name, input, size, act, share_weights):
Expand Down Expand Up @@ -136,13 +142,15 @@ def bidirectional_gru_bn_layer(name, input, size, act):
return paddle.layer.concat(input=[forward_gru, backward_gru])


def conv_group(input, num_stacks):
def conv_group(input, num_stacks, index_range_datas):
"""Convolution group with stacked convolution layers.

:param input: Input layer.
:type input: LayerOutput
:param num_stacks: Number of stacked convolution layers.
:type num_stacks: int
:param index_range_datas: Index ranges for each convolution layer.
:type index_range_datas: tuple|list
:return: Output layer of the convolution group.
:rtype: LayerOutput
"""
Expand All @@ -153,7 +161,8 @@ def conv_group(input, num_stacks):
num_channels_out=32,
stride=(3, 2),
padding=(5, 20),
act=paddle.activation.BRelu())
act=paddle.activation.BRelu(),
index_range_data=index_range_datas[0])
for i in xrange(num_stacks - 1):
conv = conv_bn_layer(
input=conv,
Expand All @@ -162,7 +171,8 @@ def conv_group(input, num_stacks):
num_channels_out=32,
stride=(1, 2),
padding=(5, 10),
act=paddle.activation.BRelu())
act=paddle.activation.BRelu(),
index_range_data=index_range_datas[i + 1])
output_num_channels = 32
output_height = 160 // pow(2, num_stacks) + 1
return conv, output_num_channels, output_height
Expand Down Expand Up @@ -207,6 +217,9 @@ def rnn_group(input, size, num_stacks, use_gru, share_rnn_weights):

def deep_speech_v2_network(audio_data,
text_data,
seq_offset_data,
seq_len_data,
index_range_datas,
dict_size,
num_conv_layers=2,
num_rnn_layers=3,
Expand All @@ -219,6 +232,12 @@ def deep_speech_v2_network(audio_data,
:type audio_data: LayerOutput
:param text_data: Transcription text data layer.
:type text_data: LayerOutput
:param seq_offset_data: Sequence offset data layer.
:type seq_offset_data: LayerOutput
:param seq_len_data: Valid sequence length data layer.
:type seq_len_data: LayerOutput
:param index_range_datas: Index ranges data layers.
:type index_range_datas: tuple|list
:param dict_size: Dictionary size for tokenized transcription.
:type dict_size: int
:param num_conv_layers: Number of stacking convolution layers.
Expand All @@ -239,7 +258,9 @@ def deep_speech_v2_network(audio_data,
"""
# convolution group
conv_group_output, conv_group_num_channels, conv_group_height = conv_group(
input=audio_data, num_stacks=num_conv_layers)
input=audio_data,
num_stacks=num_conv_layers,
index_range_datas=index_range_datas)
# convert data form convolution feature map to sequence of vectors
conv2seq = paddle.layer.block_expand(
input=conv_group_output,
Expand All @@ -248,9 +269,16 @@ def deep_speech_v2_network(audio_data,
stride_y=1,
block_x=1,
block_y=conv_group_height)
# remove padding part
remove_padding_data = paddle.layer.sub_seq(
input=conv2seq,
offsets=seq_offset_data,
sizes=seq_len_data,
act=paddle.activation.Linear(),
bias_attr=False)
# rnn group
rnn_group_output = rnn_group(
input=conv2seq,
input=remove_padding_data,
size=rnn_size,
num_stacks=num_rnn_layers,
use_gru=use_gru,
Expand Down
8 changes: 5 additions & 3 deletions deep_speech_2/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,8 @@ def evaluate():
augmentation_config='{}',
specgram_type=args.specgram_type,
num_threads=args.num_proc_data,
keep_transcription_text=True)
keep_transcription_text=True,
num_conv_layers=args.num_conv_layers)
batch_reader = data_generator.batch_reader_creator(
manifest_path=args.test_manifest,
batch_size=args.batch_size,
Expand Down Expand Up @@ -103,8 +104,9 @@ def evaluate():
cutoff_top_n=args.cutoff_top_n,
vocab_list=vocab_list,
language_model_path=args.lang_model_path,
num_processes=args.num_proc_bsearch)
target_transcripts = [transcript for _, transcript in infer_data]
num_processes=args.num_proc_bsearch,
feeding_dict=data_generator.feeding)
target_transcripts = [data[1] for data in infer_data]
for target, result in zip(target_transcripts, result_transcripts):
error_sum += error_rate_func(target, result)
num_ins += 1
Expand Down
6 changes: 4 additions & 2 deletions deep_speech_2/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,13 +75,15 @@ def train():
max_duration=args.max_duration,
min_duration=args.min_duration,
specgram_type=args.specgram_type,
num_threads=args.num_proc_data)
num_threads=args.num_proc_data,
num_conv_layers=args.num_conv_layers)
dev_generator = DataGenerator(
vocab_filepath=args.vocab_path,
mean_std_filepath=args.mean_std_path,
augmentation_config="{}",
specgram_type=args.specgram_type,
num_threads=args.num_proc_data)
num_threads=args.num_proc_data,
num_conv_layers=args.num_conv_layers)
train_batch_reader = train_generator.batch_reader_creator(
manifest_path=args.train_manifest,
batch_size=args.batch_size,
Expand Down