Skip to content

Commit

Permalink
Update RNN-T pipeline tests to include GPU resampling and silence det…
Browse files Browse the repository at this point in the history
…ection (#3920)

Signed-off-by: Joaquin Anton <[email protected]>
  • Loading branch information
jantonguirao authored May 26, 2022
1 parent 01152da commit f783bc1
Showing 1 changed file with 64 additions and 15 deletions.
79 changes: 64 additions & 15 deletions dali/test/python/test_torch_pipeline_rnnt.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -24,6 +24,7 @@
import math
import random
import os
from nose.tools import nottest

audio_files = get_files('db/audio/wav', 'wav')
audio_files = [file for file in audio_files if '237-134500' in file] # Filtering librispeech samples
Expand Down Expand Up @@ -247,7 +248,7 @@ def dali_frame_splicing_graph(x, nfeatures, x_len, stacking=1, subsampling=1):
if stacking > 1:
seq = [x]
for n in range(1, stacking):
f = fn.slice(x, start=n, shape=x_len, axes=(1,), out_of_bounds_policy='pad', fill_values=0)
f = fn.slice(x, n, x_len, axes=(1,), out_of_bounds_policy='pad', fill_values=0)
seq.append(f)
x = fn.cat(*seq, axis=0)
nfeatures = nfeatures * stacking
Expand Down Expand Up @@ -276,10 +277,10 @@ def flip_1d(x):
x = fn.flip(x, vertical=1)
x = fn.reshape(x, shape=(-1,), layout="t")
return x
pad_start = fn.slice(x, start=1, shape=pad_amount, axes=(0,))
pad_start = fn.slice(x, 1, pad_amount, axes=(0,))
pad_start = flip_1d(pad_start)

pad_end = fn.slice(x, start=(x_len-pad_amount-1), shape=pad_amount, axes=(0,))
pad_end = fn.slice(x, x_len-pad_amount-1, pad_amount, axes=(0,))
pad_end = flip_1d(pad_end)
x = fn.cat(pad_start, x, pad_end, axis=0)
return x
Expand All @@ -288,7 +289,9 @@ def flip_1d(x):
def rnnt_train_pipe(files, sample_rate, pad_amount=0, preemph_coeff=.97,
window_size=.02, window_stride=.01, window="hann", nfeatures=64, nfft=512,
frame_splicing_stack=1, frame_splicing_subsample=1,
lowfreq=0.0, highfreq=None, normalize_type='per_feature', device='cpu'):
lowfreq=0.0, highfreq=None, normalize_type='per_feature',
speed_perturb=False, silence_trim=False,
device='cpu'):
assert normalize_type == 'per_feature' or normalize_type == 'all_features'
norm_axes = [1] if normalize_type == 'per_feature' else [0, 1]
win_len, win_hop = win_args(sample_rate, window_size, window_stride)
Expand All @@ -298,37 +301,56 @@ def rnnt_train_pipe(files, sample_rate, pad_amount=0, preemph_coeff=.97,
data, _ = fn.readers.file(files=files, device="cpu", random_shuffle=False, shard_id=0, num_shards=1)
audio, _ = fn.decoders.audio(data, dtype=types.FLOAT, downmix=True)

audio_shape = fn.shapes(audio, dtype=types.INT32)
orig_audio_len = fn.slice(audio_shape, 0, 1, axes=(0,))
# splicing with subsampling doesn't work if audio_len is a GPU data node
if device == 'gpu' and frame_splicing_subsample == 1:
audio = audio.gpu()

if pad_amount > 0:
audio_len = orig_audio_len + 2 * pad_amount
else:
audio_len = orig_audio_len
# Speed perturbation 0.85x - 1.15x
if speed_perturb:
target_sr_factor = fn.random.uniform(device="cpu", range=(1/1.15, 1/0.85))
audio = fn.experimental.audio_resample(audio, scale=target_sr_factor)

spec_len = audio_len // win_hop + 1
# Silence trimming
if silence_trim:
begin, length = fn.nonsilent_region(audio, cutoff_db=-80)
audio = fn.slice(audio, begin, length, axes=[0])

if device == 'gpu':
audio_shape = fn.shapes(audio, dtype=types.INT32)
orig_audio_len = fn.slice(audio_shape, 0, 1, axes=(0,))

# If we couldn't move to GPU earlier, do it now
if device == 'gpu' and frame_splicing_subsample > 1:
audio = audio.gpu()

if pad_amount > 0:
audio_len = orig_audio_len + 2 * pad_amount
padded_audio = dali_reflect_pad_graph(audio, orig_audio_len, pad_amount)
else:
audio_len = orig_audio_len
padded_audio = audio

# Preemphasis filter
preemph_audio = fn.preemphasis_filter(padded_audio, preemph_coeff=preemph_coeff, border='zero')

# Spectrogram
spec_len = audio_len // win_hop + 1
spec = fn.spectrogram(preemph_audio, nfft=nfft, window_fn=window_fn_arg, window_length=win_len, window_step=win_hop,
center_windows=True, reflect_padding=True)
# Mel spectrogram
mel_spec = fn.mel_filter_bank(spec, sample_rate=sample_rate, nfilter=nfeatures, freq_low=lowfreq, freq_high=highfreq)

# Log
log_features = fn.to_decibels(mel_spec + 1e-20, multiplier=np.log(10), reference=1.0, cutoff_db=-80)

# Frame splicing
if frame_splicing_stack > 1 or frame_splicing_subsample > 1:
log_features_spliced = dali_frame_splicing_graph(log_features, nfeatures, spec_len,
stacking=frame_splicing_stack,
subsampling=frame_splicing_subsample)
else:
log_features_spliced = log_features

# Normalization
if normalize_type:
norm_log_features = fn.normalize(log_features_spliced, axes=norm_axes, device=device, epsilon=4e-5, ddof=1)
else:
Expand All @@ -352,6 +374,8 @@ def _testimpl_rnnt_data_pipeline(device, pad_amount=0, preemph_coeff=.97, window
window="hann", nfeatures=64, n_fft=512, frame_splicing_stack=1, frame_splicing_subsample=1,
lowfreq=0.0, highfreq=None, normalize_type='per_feature', batch_size=32):
sample_rate = npy_files_sr
speed_perturb = False
silence_trim = False

ref_pipeline = FilterbankFeatures(
sample_rate=sample_rate, window_size=window_size, window_stride=window_stride, window=window, normalize=normalize_type,
Expand All @@ -369,8 +393,8 @@ def _testimpl_rnnt_data_pipeline(device, pad_amount=0, preemph_coeff=.97, window

pipe = rnnt_train_pipe(
audio_files, sample_rate, pad_amount, preemph_coeff, window_size, window_stride, window, nfeatures,
n_fft, frame_splicing_stack, frame_splicing_subsample, lowfreq, highfreq, normalize_type, device,
seed=42, batch_size=batch_size
n_fft, frame_splicing_stack, frame_splicing_subsample, lowfreq, highfreq, normalize_type,
speed_perturb, silence_trim, device, seed=42, batch_size=batch_size
)
pipe.build()
nbatches = (nrecordings + batch_size - 1) // batch_size
Expand Down Expand Up @@ -445,3 +469,28 @@ def test_rnnt_data_pipeline():
yield _testimpl_rnnt_data_pipeline, device, \
pad_amount, preemph_coeff, window_size, window_stride, window, nfeatures, n_fft, \
frame_splicing_stack, frame_splicing_subsample, lowfreq, highfreq, normalize_type

@nottest # To be run manually to check perf
def test_rnnt_data_pipeline_throughput(pad_amount=0, preemph_coeff=.97, window_size=.02, window_stride=.01,
window="hann", nfeatures=64, n_fft=512, frame_splicing_stack=1, frame_splicing_subsample=1,
speed_perturb=True, silence_trim=True, lowfreq=0.0, highfreq=None, normalize_type='per_feature', batch_size=32):
sample_rate = npy_files_sr
device = 'gpu'
pipe = rnnt_train_pipe(
audio_files, sample_rate, pad_amount, preemph_coeff, window_size, window_stride, window, nfeatures,
n_fft, frame_splicing_stack, frame_splicing_subsample, lowfreq, highfreq,
normalize_type, speed_perturb, silence_trim, device, seed=42, batch_size=batch_size
)
pipe.build()

import time
from test_utils import AverageMeter
end = time.time()
data_time = AverageMeter()
iters = 1000
for j in range(iters):
pipe.run()
data_time.update(time.time() - end)
if j % 100 == 0:
print(f"run {j+1}/ {iters}, avg time: {data_time.avg} [s], worst time: {data_time.max_val} [s], speed: {batch_size / data_time.avg} [recordings/s]")
end = time.time()

0 comments on commit f783bc1

Please sign in to comment.