From 059a7549aebb3318a19b398d006b59cffb465a83 Mon Sep 17 00:00:00 2001 From: Zixuan Cheng <110808245+violetch24@users.noreply.github.com> Date: Tue, 19 Sep 2023 13:55:13 +0800 Subject: [PATCH] migrate rnnt to API 2.x (#1097) Signed-off-by: Cheng, Zixuan --- examples/.config/model_params_pytorch.json | 8 +- examples/README.md | 6 + .../rnnt/quantization/ptq_dynamic/fx/QSL.py | 68 +++++ .../quantization/ptq_dynamic/fx/README.md | 75 +++++ .../ptq_dynamic/fx/accuracy_eval.py | 51 ++++ .../quantization/ptq_dynamic/fx/mlperf.conf | 65 +++++ .../ptq_dynamic/fx/prepare_dataset.sh | 55 ++++ .../ptq_dynamic/fx/prepare_loadgen.sh | 10 + .../ptq_dynamic/fx/pytorch/Dockerfile | 46 ++++ .../ptq_dynamic/fx/pytorch/LICENSE | 204 ++++++++++++++ .../ptq_dynamic/fx/pytorch/configs/rnnt.toml | 77 ++++++ .../ptq_dynamic/fx/pytorch/dataset.py | 159 +++++++++++ .../ptq_dynamic/fx/pytorch/decoders.py | 121 ++++++++ .../ptq_dynamic/fx/pytorch/helpers.py | 123 +++++++++ .../ptq_dynamic/fx/pytorch/metrics.py | 67 +++++ .../fx/pytorch/model_separable_rnnt.py | 214 ++++++++++++++ .../ptq_dynamic/fx/pytorch/parts/features.py | 260 ++++++++++++++++++ .../ptq_dynamic/fx/pytorch/parts/manifest.py | 176 ++++++++++++ .../ptq_dynamic/fx/pytorch/parts/segment.py | 170 ++++++++++++ .../ptq_dynamic/fx/pytorch/parts/text/LICENSE | 19 ++ .../fx/pytorch/parts/text/__init__.py | 12 + .../fx/pytorch/parts/text/cleaners.py | 116 ++++++++ .../fx/pytorch/parts/text/numbers.py | 101 +++++++ .../ptq_dynamic/fx/pytorch/preprocessing.py | 39 +++ .../ptq_dynamic/fx/pytorch/rnn.py | 110 ++++++++ .../fx/pytorch/scripts/docker/build.sh | 3 + .../fx/pytorch/scripts/docker/launch.sh | 32 +++ .../pytorch/scripts/download_librispeech.sh | 28 ++ .../fx/pytorch/scripts/evaluation.sh | 92 +++++++ .../fx/pytorch/scripts/inference.sh | 104 +++++++ .../fx/pytorch/scripts/inference_benchmark.sh | 84 ++++++ .../pytorch/scripts/preprocess_librispeech.sh | 51 ++++ .../ptq_dynamic/fx/pytorch/scripts/train.sh | 113 ++++++++ .../fx/pytorch/scripts/train_benchmark.sh | 130 +++++++++ .../ptq_dynamic/fx/pytorch/utils/__init__.py | 0 .../fx/pytorch/utils/convert_librispeech.py | 82 ++++++ .../fx/pytorch/utils/download_librispeech.py | 76 +++++ .../fx/pytorch/utils/download_utils.py | 69 +++++ .../fx/pytorch/utils/preprocessing_utils.py | 77 ++++++ .../ptq_dynamic/fx/pytorch_SUT.py | 104 +++++++ .../ptq_dynamic/fx/requirements.txt | 10 + .../rnnt/quantization/ptq_dynamic/fx/run.sh | 90 ++++++ .../ptq_dynamic/fx/run_benchmark.sh | 84 ++++++ .../quantization/ptq_dynamic/fx/run_quant.sh | 59 ++++ .../quantization/ptq_dynamic/fx/run_tune.py | 151 ++++++++++ .../quantization/ptq_dynamic/fx/user.conf | 6 + 46 files changed, 3792 insertions(+), 5 deletions(-) create mode 100644 examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/QSL.py create mode 100644 examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/README.md create mode 100644 examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/accuracy_eval.py create mode 100644 examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/mlperf.conf create mode 100644 examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/prepare_dataset.sh create mode 100644 examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/prepare_loadgen.sh create mode 100644 examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/Dockerfile create mode 100644 examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/LICENSE 
create mode 100644 examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/configs/rnnt.toml create mode 100644 examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/dataset.py create mode 100644 examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/decoders.py create mode 100644 examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/helpers.py create mode 100644 examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/metrics.py create mode 100644 examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/model_separable_rnnt.py create mode 100644 examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/parts/features.py create mode 100644 examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/parts/manifest.py create mode 100644 examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/parts/segment.py create mode 100644 examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/parts/text/LICENSE create mode 100644 examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/parts/text/__init__.py create mode 100644 examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/parts/text/cleaners.py create mode 100644 examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/parts/text/numbers.py create mode 100644 examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/preprocessing.py create mode 100644 examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/rnn.py create mode 100644 examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/scripts/docker/build.sh create mode 100644 examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/scripts/docker/launch.sh create mode 100644 examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/scripts/download_librispeech.sh create mode 100644 examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/scripts/evaluation.sh create mode 100644 examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/scripts/inference.sh create mode 100644 examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/scripts/inference_benchmark.sh create mode 100644 examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/scripts/preprocess_librispeech.sh create mode 100644 examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/scripts/train.sh create mode 100644 examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/scripts/train_benchmark.sh create mode 100644 examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/utils/__init__.py create mode 100644 examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/utils/convert_librispeech.py create mode 100644 examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/utils/download_librispeech.py create mode 100644 examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/utils/download_utils.py create mode 100644 examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/utils/preprocessing_utils.py create mode 100644 examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch_SUT.py create mode 100644 
examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/requirements.txt create mode 100644 examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/run.sh create mode 100644 examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/run_benchmark.sh create mode 100644 examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/run_quant.sh create mode 100644 examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/run_tune.py create mode 100644 examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/user.conf diff --git a/examples/.config/model_params_pytorch.json b/examples/.config/model_params_pytorch.json index 276ea3c069a..23ca2597e3b 100644 --- a/examples/.config/model_params_pytorch.json +++ b/examples/.config/model_params_pytorch.json @@ -317,13 +317,11 @@ "main_script": "run.py" }, "rnnt": { - "model_src_dir": "speech_recognition/rnnt/quantization/ptq_dynamic/eager", + "model_src_dir": "speech_recognition/rnnt/quantization/ptq_dynamic/fx", "dataset_location": "/tf_dataset/pytorch/rnnt/convert_dataset/", "input_model": "/tf_dataset/pytorch/rnnt/rnnt.pt", - "yaml": "conf.yaml", - "strategy": "basic", - "batch_size": 100, - "new_benchmark": false + "main_script": "run_tune.py", + "batch_size": 100 }, "wav2vec2_dynamic":{ "model_src_dir": "speech_recognition/torchaudio_models/quantization/ptq_dynamic/fx", diff --git a/examples/README.md b/examples/README.md index 9cbe79a132c..0bb4e4177a5 100644 --- a/examples/README.md +++ b/examples/README.md @@ -508,6 +508,12 @@ Intel® Neural Compressor validated examples with multiple compression technique Post-Training Dynamic Quantization fx + + RNNT + Speech Recognition + Post-Training Dynamic Quantization + fx + BlendCNN Natural Language Processing diff --git a/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/QSL.py b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/QSL.py new file mode 100644 index 00000000000..9c0abe4e734 --- /dev/null +++ b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/QSL.py @@ -0,0 +1,68 @@ +import sys +import os +sys.path.insert(0, os.path.join(os.getcwd(), "pytorch")) + +from parts.manifest import Manifest +from parts.segment import AudioSegment + +import numpy as np + +import mlperf_loadgen as lg + + +class AudioQSL: + def __init__(self, dataset_dir, manifest_filepath, labels, + sample_rate=16000, perf_count=None): + m_paths = [manifest_filepath] + self.manifest = Manifest(dataset_dir, m_paths, labels, len(labels), + normalize=True, max_duration=15.0) + self.sample_rate = sample_rate + self.count = len(self.manifest) + perf_count = self.count if perf_count is None else perf_count + self.sample_id_to_sample = {} + self.qsl = lg.ConstructQSL(self.count, perf_count, + self.load_query_samples, + self.unload_query_samples) + print( + "Dataset loaded with {0:.2f} hours. Filtered {1:.2f} hours. 
Number of samples: {2}".format( + self.manifest.duration / 3600, + self.manifest.filtered_duration / 3600, + self.count)) + + def load_query_samples(self, sample_list): + for sample_id in sample_list: + self.sample_id_to_sample[sample_id] = self._load_sample(sample_id) + + def unload_query_samples(self, sample_list): + for sample_id in sample_list: + del self.sample_id_to_sample[sample_id] + + def _load_sample(self, index): + sample = self.manifest[index] + segment = AudioSegment.from_file(sample['audio_filepath'][0], + target_sr=self.sample_rate) + waveform = segment.samples + assert isinstance(waveform, np.ndarray) and waveform.dtype == np.float32 + return waveform + + def __getitem__(self, index): + return self.sample_id_to_sample[index] + + def __del__(self): + lg.DestroyQSL(self.qsl) + print("Finished destroying QSL.") + +# We have no problem fitting all data in memory, so we do that, in +# order to speed up execution of the benchmark. +class AudioQSLInMemory(AudioQSL): + def __init__(self, dataset_dir, manifest_filepath, labels, + sample_rate=16000, perf_count=None): + super().__init__(dataset_dir, manifest_filepath, labels, + sample_rate, perf_count) + super().load_query_samples(range(self.count)) + + def load_query_samples(self, sample_list): + pass + + def unload_query_samples(self, sample_list): + pass diff --git a/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/README.md b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/README.md new file mode 100644 index 00000000000..9cc32ad3df5 --- /dev/null +++ b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/README.md @@ -0,0 +1,75 @@ +Step-by-Step +============ + +This document lists steps of reproducing Intel Optimized PyTorch RNNT models tuning results via Neural Compressor. + +Our example comes from MLPerf Inference Benchmark Suite. + + +# Prerequisite + +## 1. Environment + Python 3.6 or higher version is recommended. + + ```shell + cd examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx + pip install -r requirements.txt + ``` + Check your gcc version with the command: **gcc -v** + + GCC5 or above is required. + + ```shell + # install mlperf + bash prepare_loadgen.sh + ``` + +## 2. Prepare Dataset + + ```shell + bash prepare_dataset.sh --download_dir=origin_dataset --convert_dir=convert_dataset + ``` + + prepare_dataset.sh contains two stages: + - stage1: download LibriSpeech/dev-clean dataset and extract it. + - stage2: convert .flac file to .wav file + +## 3. Prepare Pre-trained Model + + ```shell + wget https://zenodo.org/record/3662521/files/DistributedDataParallel_1576581068.9962234-epoch-100.pt?download=1 -O rnnt.pt + ``` + +# Run + +## 1. Enable RNNT example with the auto dynamic quantization strategy of Neural Compressor. + + The changes made are as follows: + 1. pytorch_SUT.py: + Removed jit script conversion. + 2. pytorch/decoders.py: + Removed assertion of torch.jit.ScriptModule. + +## 2. Tuning command: +```shell +bash run_tuning.sh --dataset_location=convert_dataset --input_model=./rnnt.pt --output_model=saved_results +``` +## 3. Benchmark command: +```shell +# fp32 +bash run_benchmark.sh --dataset_location=convert_dataset --input_model=./rnnt.pt --mode=performance/accuracy --int8=false +# int8 +bash run_benchmark.sh --dataset_location=convert_dataset --input_model=./rnnt.pt --mode=performance/accuracy --int8=true +``` +## 4. Brief output information: + +The first part is accuracy/percentage, right part is time_usage/second. 
+ + - FP32 baseline is: [92.5477, 796.7552]. + - Tune 1 result is: [91.5872, 1202.2529] + - Tune 2 result is: [91.5894, 1201.3231] + - Tune 3 result is: [91.5195, 1211.5965] + - Tune 4 result is: [91.6030, 1218.2211] + - Tune 5 result is: [91.4812, 1169.5080] + - ... + diff --git a/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/accuracy_eval.py b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/accuracy_eval.py new file mode 100644 index 00000000000..4341900c536 --- /dev/null +++ b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/accuracy_eval.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python + +import argparse +import array +import json +import sys +import os + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "pytorch")) + +from helpers import process_evaluation_epoch, __gather_predictions +from parts.manifest import Manifest + +dtype_map = { + "int8": 'b', + "int16": 'h', + "int32": 'l', + "int64": 'q', +} + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--log_dir", required=True) + parser.add_argument("--dataset_dir", required=True) + parser.add_argument("--manifest", required=True) + parser.add_argument("--output_dtype", default="int64", choices=dtype_map.keys(), help="Output data type") + args = parser.parse_args() + return args + +def main(): + args = get_args() + labels = [" ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "'"] + manifest = Manifest(args.dataset_dir, [args.manifest], labels, len(labels), normalize=True, max_duration=15.0) + with open(os.path.join(args.log_dir, "mlperf_log_accuracy.json")) as fh: + results = json.load(fh) + hypotheses = [] + references = [] + for result in results: + hypotheses.append(array.array(dtype_map[args.output_dtype], bytes.fromhex(result["data"])).tolist()) + references.append(manifest[result["qsl_idx"]]["transcript"]) + + references = __gather_predictions([references], labels=labels) + hypotheses = __gather_predictions([hypotheses], labels=labels) + + d = dict(predictions=hypotheses, + transcripts=references) + wer = process_evaluation_epoch(d) + print("Word Error Rate: {:}%, accuracy={:}%".format(wer * 100, (1 - wer) * 100)) + +if __name__ == '__main__': + main() diff --git a/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/mlperf.conf b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/mlperf.conf new file mode 100644 index 00000000000..7f5b55b58e2 --- /dev/null +++ b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/mlperf.conf @@ -0,0 +1,65 @@ +# The format of this config file is 'key = value'. +# The key has the format 'model.scenario.key'. Value is mostly int64_t. +# Model maybe '*' as wildcard. In that case the value applies to all models. +# All times are in milli seconds + +# Set performance_sample_count for each model. +# User can optionally set this to higher values in user.conf. +mobilenet.*.performance_sample_count_override = 1024 +gnmt.*.performance_sample_count_override = 3903900 +resnet50.*.performance_sample_count_override = 1024 +ssd-mobilenet.*.performance_sample_count_override = 256 +ssd-resnet34.*.performance_sample_count_override = 64 +bert.*.performance_sample_count_override = 10833 +dlrm.*.performance_sample_count_override = 204800 +rnnt.*.performance_sample_count_override = 2513 +3d-unet.*.performance_sample_count_override = 16 + +# Set seeds. 
The seeds will be distributed two weeks before the submission. +*.*.qsl_rng_seed = 12786827339337101903 +*.*.sample_index_rng_seed = 12640797754436136668 +*.*.schedule_rng_seed = 3135815929913719677 + +*.SingleStream.target_latency_percentile = 90 +*.SingleStream.min_duration = 60000 +*.SingleStream.min_query_count = 1024 + +*.MultiStream.target_qps = 20 +*.MultiStream.target_latency_percentile = 99 +*.MultiStream.max_async_queries = 1 +*.MultiStream.target_latency = 50 +*.MultiStream.min_duration = 60000 +*.MultiStream.min_query_count = 270336 +ssd-resnet34.MultiStream.target_qps = 15 +ssd-resnet34.MultiStream.target_latency = 66 +gnmt.MultiStream.min_query_count = 90112 +gnmt.MultiStream.target_latency = 100 +gnmt.MultiStream.target_qps = 10 +gnmt.MultiStream.target_latency_percentile = 97 + +*.Server.target_latency = 10 +*.Server.target_latency_percentile = 99 +*.Server.target_duration = 0 +*.Server.min_duration = 60000 +*.Server.min_query_count = 270336 +resnet50.Server.target_latency = 15 +ssd-resnet34.Server.target_latency = 100 +gnmt.Server.min_query_count = 90112 +gnmt.Server.target_latency = 250 +gnmt.Server.target_latency_percentile = 97 +bert.Server.target_latency = 130 +dlrm.Server.target_latency = 30 +rnnt.Server.target_latency = 1000 + +*.Offline.target_latency_percentile = 90 +*.Offline.min_duration = 60000 +# In Offline scenario, we always have one query. But LoadGen maps this to +# min_sample_count internally in Offline scenario, so set this to 24576 since +# the rule requires that Offline scenario run for at least 24576 samples. +*.Offline.min_query_count = 24576 + +# These fields should be defined and overridden by user.conf. +*.SingleStream.target_latency = 10 +*.Server.target_qps = 1.0 +*.Offline.target_qps = 1.0 +*.MultiStream.samples_per_query = 4 diff --git a/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/prepare_dataset.sh b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/prepare_dataset.sh new file mode 100644 index 00000000000..2c517ee70ba --- /dev/null +++ b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/prepare_dataset.sh @@ -0,0 +1,55 @@ +#!/bin/bash +set -x + +function main { + + init_params "$@" + prepare_dataset + +} + +# init params +function init_params { + for var in "$@" + do + case $var in + --download_dir=*) + download_dir=$(echo $var |cut -f2 -d=) + ;; + --convert_dir=*) + convert_dir=$(echo $var |cut -f2 -d=) + ;; + *) + echo "Error: No such parameter: ${var}" + exit 1 + ;; + esac + done + + mkdir -p $download_dir $convert_dir +} + +# prepare_dataset +function prepare_dataset { + # if you already have origin dataset, set stage=2, make sure to extract it \ + # and change the origin dataset path to your path + stage=1 + + # Download dataset + if [[ $stage -le 1 ]]; then + python pytorch/utils/download_librispeech.py \ + pytorch/utils/librispeech-inference.csv \ + $download_dir \ + -e $download_dir + fi + + # Convert dataset + if [[ $stage -le 2 ]]; then + python pytorch/utils/convert_librispeech.py \ + --input_dir $download_dir/LibriSpeech/dev-clean \ + --dest_dir $convert_dir/dev-clean-wav \ + --output_json $convert_dir/dev-clean-wav.json + fi +} + +main "$@" \ No newline at end of file diff --git a/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/prepare_loadgen.sh b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/prepare_loadgen.sh new file mode 100644 index 00000000000..e30eea60442 --- /dev/null +++ 
b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/prepare_loadgen.sh @@ -0,0 +1,10 @@ +pushd . +echo "Install loadgen" +git clone --recurse-submodules https://github.com/mlcommons/inference.git mlperf_inference +cd mlperf_inference +git checkout r2.1 +git log -1 +git submodule update --init --recursive +cd loadgen +CFLAGS="-std=c++14" python setup.py install +popd diff --git a/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/Dockerfile b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/Dockerfile new file mode 100644 index 00000000000..1cb52bf6261 --- /dev/null +++ b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/Dockerfile @@ -0,0 +1,46 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:19.09-py3 +FROM ${FROM_IMAGE_NAME} + + +RUN apt-get update && apt-get install -y libsndfile1 && apt-get install -y sox && rm -rf /var/lib/apt/lists/* + +RUN COMMIT_SHA=c6d12f9e1562833c2b4e7ad84cb22aa4ba31d18c && \ + git clone https://github.com/HawkAaron/warp-transducer deps/warp-transducer && \ + cd deps/warp-transducer && \ + git checkout $COMMIT_SHA && \ + mkdir build && \ + cd build && \ + cmake .. && \ + make VERBOSE=1 && \ + export CUDA_HOME="/usr/local/cuda" && \ + export WARP_RNNT_PATH=`pwd` && \ + export CUDA_TOOLKIT_ROOT_DIR=$CUDA_HOME && \ + export LD_LIBRARY_PATH="$CUDA_HOME/extras/CUPTI/lib64:$LD_LIBRARY_PATH" && \ + export LIBRARY_PATH=$CUDA_HOME/lib64:$LIBRARY_PATH && \ + export LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH && \ + export CFLAGS="-I$CUDA_HOME/include $CFLAGS" && \ + cd ../pytorch_binding && \ + python3 setup.py install --user && \ + rm -rf ../tests test ../tensorflow_binding && \ + cd ../../.. + +WORKDIR /workspace/jasper + +COPY requirements.txt . +RUN pip install --disable-pip-version-check -U -r requirements.txt + +COPY . . diff --git a/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/LICENSE b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/LICENSE new file mode 100644 index 00000000000..75ee157cd96 --- /dev/null +++ b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/LICENSE @@ -0,0 +1,204 @@ + Except where otherwise noted, the following license applies to all files in this repo. + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2019 NVIDIA Corporation + Copyright 2019 Myrtle Software Limited, www.myrtle.ai + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
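The LoadGen pieces added above (QSL.py, prepare_loadgen.sh, mlperf.conf/user.conf) are ultimately driven from pytorch_SUT.py, which appears later in this patch. As a rough orientation only, an accuracy run wires them together along the lines of the sketch below; `run_rnnt_greedy` is a hypothetical stand-in for the model's encoder/decoder call, and the exact `lg.ConstructSUT` signature varies between loadgen releases (older ones also take a `process_latencies` callback).

```python
# Sketch of the MLPerf LoadGen accuracy flow used by this example (not the
# literal pytorch_SUT.py). `run_rnnt_greedy` is a hypothetical placeholder.
import array
import mlperf_loadgen as lg
from QSL import AudioQSLInMemory

labels = [" "] + list("abcdefghijklmnopqrstuvwxyz") + ["'"]
qsl = AudioQSLInMemory("convert_dataset",
                       "convert_dataset/dev-clean-wav.json",
                       labels, sample_rate=16000, perf_count=2513)

responses_kept_alive = []  # response buffers must outlive QuerySamplesComplete

def issue_queries(query_samples):
    responses = []
    for qs in query_samples:
        waveform = qsl[qs.index]                 # float32 numpy array from the QSL
        transcript = run_rnnt_greedy(waveform)   # hypothetical: returns list of label ids
        buf = array.array('q', transcript)       # int64, matching accuracy_eval.py's dtype_map
        responses_kept_alive.append(buf)
        ptr, n = buf.buffer_info()
        responses.append(lg.QuerySampleResponse(qs.id, ptr, n * buf.itemsize))
    lg.QuerySamplesComplete(responses)

def flush_queries():
    pass

settings = lg.TestSettings()
settings.scenario = lg.TestScenario.Offline
settings.FromConfig("mlperf.conf", "rnnt", "Offline")
settings.FromConfig("user.conf", "rnnt", "Offline")
settings.mode = lg.TestMode.AccuracyOnly

sut = lg.ConstructSUT(issue_queries, flush_queries)  # signature may differ by loadgen version
lg.StartTest(sut, qsl.qsl, settings)                 # writes mlperf_log_accuracy.json
lg.DestroySUT(sut)
```

accuracy_eval.py above then scores mlperf_log_accuracy.json against the manifest to report the word error rate.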
diff --git a/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/configs/rnnt.toml b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/configs/rnnt.toml new file mode 100644 index 00000000000..a4cd1dfb470 --- /dev/null +++ b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/configs/rnnt.toml @@ -0,0 +1,77 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2019, Myrtle Software Limited. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +model = "RNNT" + +[input] +normalize = "per_feature" +sample_rate = 16000 +window_size = 0.02 +window_stride = 0.01 +window = "hann" +features = 80 +n_fft = 512 +frame_splicing = 3 +dither = 0.00001 +feat_type = "logfbank" +normalize_transcripts = true +trim_silence = true +pad_to = 0 # TODO +max_duration = 16.7 +speed_perturbation = true + + +cutout_rect_regions = 0 +cutout_rect_time = 60 +cutout_rect_freq = 25 + + +cutout_x_regions = 2 +cutout_y_regions = 2 +cutout_x_width = 6 +cutout_y_width = 6 + + +[input_eval] +normalize = "per_feature" +sample_rate = 16000 +window_size = 0.02 +window_stride = 0.01 +window = "hann" +features = 80 +n_fft = 512 +frame_splicing = 3 +dither = 0.00001 +feat_type = "logfbank" +normalize_transcripts = true +trim_silence = true +pad_to = 0 + + +[rnnt] +rnn_type = "lstm" +encoder_n_hidden = 1024 +encoder_pre_rnn_layers = 2 +encoder_stack_time_factor = 2 +encoder_post_rnn_layers = 3 +pred_n_hidden = 320 +pred_rnn_layers = 2 +forget_gate_bias = 1.0 +joint_n_hidden = 512 +dropout=0.32 + + +[labels] +labels = [" ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "'"] diff --git a/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/dataset.py b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/dataset.py new file mode 100644 index 00000000000..7b9036f1c55 --- /dev/null +++ b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/dataset.py @@ -0,0 +1,159 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" +This file contains classes and functions related to data loading +""" +from collections import namedtuple +import torch +import numpy as np +from torch.utils.data import Dataset +from parts.manifest import Manifest +from parts.features import WaveformFeaturizer + + +def seq_collate_fn(batch): + """batches samples and returns as tensors + Args: + batch : list of samples + Returns + batches of tensors + """ + audio_lengths = torch.LongTensor([sample.waveform.size(0) + for sample in batch]) + transcript_lengths = torch.LongTensor([sample.transcript.size(0) + for sample in batch]) + permute_indices = torch.argsort(audio_lengths, descending=True) + + audio_lengths = audio_lengths[permute_indices] + transcript_lengths = transcript_lengths[permute_indices] + padded_audio_signals = torch.nn.utils.rnn.pad_sequence( + [batch[i].waveform for i in permute_indices], + batch_first=True + ) + transcript_list = [batch[i].transcript + for i in permute_indices] + packed_transcripts = torch.nn.utils.rnn.pack_sequence(transcript_list, + enforce_sorted=False) + + # TODO: Don't I need to stop grad at some point now? + return (padded_audio_signals, audio_lengths, transcript_list, + packed_transcripts, transcript_lengths) + + +class AudioToTextDataLayer: + """Data layer with data loader + """ + + def __init__(self, **kwargs): + featurizer_config = kwargs['featurizer_config'] + pad_to_max = kwargs.get('pad_to_max', False) + perturb_config = kwargs.get('perturb_config', None) + manifest_filepath = kwargs['manifest_filepath'] + dataset_dir = kwargs['dataset_dir'] + labels = kwargs['labels'] + batch_size = kwargs['batch_size'] + drop_last = kwargs.get('drop_last', False) + shuffle = kwargs.get('shuffle', True) + min_duration = featurizer_config.get('min_duration', 0.1) + max_duration = featurizer_config.get('max_duration', None) + normalize_transcripts = kwargs.get('normalize_transcripts', True) + trim_silence = kwargs.get('trim_silence', False) + sampler_type = kwargs.get('sampler', 'default') + speed_perturbation = featurizer_config.get('speed_perturbation', False) + sort_by_duration = sampler_type == 'bucket' + self._featurizer = WaveformFeaturizer.from_config( + featurizer_config, perturbation_configs=perturb_config) + self._dataset = AudioDataset( + dataset_dir=dataset_dir, + manifest_filepath=manifest_filepath, + labels=labels, blank_index=len(labels), + sort_by_duration=sort_by_duration, + pad_to_max=pad_to_max, + featurizer=self._featurizer, max_duration=max_duration, + min_duration=min_duration, normalize=normalize_transcripts, + trim=trim_silence, speed_perturbation=speed_perturbation) + + print('sort_by_duration', sort_by_duration) + + self._dataloader = torch.utils.data.DataLoader( + dataset=self._dataset, + batch_size=batch_size, + collate_fn=lambda b: seq_collate_fn(b), + drop_last=drop_last, + shuffle=shuffle, + num_workers=0, + pin_memory=True, + sampler=None + ) + + def __len__(self): + return len(self._dataset) + + @property + def data_iterator(self): + return self._dataloader + + +class AudioDataset(Dataset): + def __init__(self, dataset_dir, manifest_filepath, labels, featurizer, max_duration=None, pad_to_max=False, + min_duration=None, blank_index=0, max_utts=0, normalize=True, sort_by_duration=False, + trim=False, speed_perturbation=False): + """Dataset that loads tensors via a json file containing paths to audio files, transcripts, and durations + (in seconds). Each entry is a different audio sample. 
+ Args: + dataset_dir: absolute path to dataset folder + manifest_filepath: relative path from dataset folder to manifest json as described above. + labels: String containing all the possible characters to map to + featurizer: Initialized featurizer class that converts paths of audio to feature tensors + max_duration: If audio exceeds this length, do not include in dataset + min_duration: If audio is less than this length, do not include in dataset + pad_to_max: if specified input sequences into dnn model will be padded to max_duration + blank_index: blank index for ctc loss / decoder + max_utts: Limit number of utterances + normalize: whether to normalize transcript text + sort_by_duration: whether or not to sort sequences by increasing duration + trim: if specified trims leading and trailing silence from an audio signal. + speed_perturbation: specify if using data contains speed perburbation + """ + m_paths = [manifest_filepath] + self.manifest = Manifest(dataset_dir, m_paths, labels, blank_index, pad_to_max=pad_to_max, + max_duration=max_duration, + sort_by_duration=sort_by_duration, + min_duration=min_duration, max_utts=max_utts, + normalize=normalize, speed_perturbation=speed_perturbation) + self.featurizer = featurizer + self.blank_index = blank_index + self.trim = trim + print( + "Dataset loaded with {0:.2f} hours. Filtered {1:.2f} hours.".format( + self.manifest.duration / 3600, + self.manifest.filtered_duration / 3600)) + + def __getitem__(self, index): + sample = self.manifest[index] + rn_indx = np.random.randint(len(sample['audio_filepath'])) + duration = sample['audio_duration'][rn_indx] if 'audio_duration' in sample else 0 + offset = sample['offset'] if 'offset' in sample else 0 + features = self.featurizer.process(sample['audio_filepath'][rn_indx], + offset=offset, duration=duration, + trim=self.trim) + + AudioSample = namedtuple('AudioSample', ['waveform', + 'transcript']) + return AudioSample(features, + torch.LongTensor(sample["transcript"])) + + def __len__(self): + return len(self.manifest) diff --git a/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/decoders.py b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/decoders.py new file mode 100644 index 00000000000..81e6f650a58 --- /dev/null +++ b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/decoders.py @@ -0,0 +1,121 @@ +# Copyright (c) 2019, Myrtle Software Limited. All rights reserved. +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Optional, Tuple + +import torch + +import torch.nn.functional as F +from model_separable_rnnt import label_collate + + +class ScriptGreedyDecoder(torch.nn.Module): + """A greedy transducer decoder. + + Args: + blank_symbol: See `Decoder`. + model: Model to use for prediction. 
+ max_symbols_per_step: The maximum number of symbols that can be added + to a sequence in a single time step; if set to None then there is + no limit. + cutoff_prob: Skip to next step in search if current highest character + probability is less than this. + """ + + def __init__(self, blank_index, model, max_symbols_per_step=30): + super().__init__() + # assert isinstance(model, torch.jit.ScriptModule) + # assert not model.training + self.eval() + self._model = model + self._blank_id = blank_index + self._SOS = -1 + assert max_symbols_per_step > 0 + self._max_symbols_per_step = max_symbols_per_step + + @torch.jit.export + def forward(self, x: torch.Tensor, out_lens: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, List[List[int]]]: + """Returns a list of sentences given an input batch. + + Args: + x: A tensor of size (batch, channels, features, seq_len) + TODO was (seq_len, batch, in_features). + out_lens: list of int representing the length of each sequence + output sequence. + + Returns: + list containing batch number of sentences (strings). + """ + # Apply optional preprocessing + logits, logits_lens = self._model.encoder(x, out_lens) + + output: List[List[int]] = [] + for batch_idx in range(logits.size(0)): + inseq = logits[batch_idx, :, :].unsqueeze(1) + # inseq: TxBxF + logitlen = logits_lens[batch_idx] + sentence = self._greedy_decode(inseq, logitlen) + output.append(sentence) + + return logits, logits_lens, output + + def _greedy_decode(self, x: torch.Tensor, out_len: torch.Tensor) -> List[int]: + hidden: Optional[Tuple[torch.Tensor, torch.Tensor]] = None + label: List[int] = [] + for time_idx in range(int(out_len.item())): + f = x[time_idx, :, :].unsqueeze(0) + + not_blank = True + symbols_added = 0 + + while not_blank and symbols_added < self._max_symbols_per_step: + g, hidden_prime = self._pred_step( + self._get_last_symb(label), + hidden + ) + logp = self._joint_step(f, g, log_normalize=False)[0, :] + + # get index k, of max prob + v, k = logp.max(0) + k = k.item() + + if k == self._blank_id: + not_blank = False + else: + label.append(k) + hidden = hidden_prime + symbols_added += 1 + + return label + + def _pred_step(self, label: int, hidden: Optional[Tuple[torch.Tensor, torch.Tensor]]) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + if label == self._SOS: + return self._model.prediction(None, hidden) + if label > self._blank_id: + label -= 1 + label = torch.tensor([[label]], dtype=torch.int64) + return self._model.prediction(label, hidden) + + def _joint_step(self, enc: torch.Tensor, pred: torch.Tensor, log_normalize: bool=False) -> torch.Tensor: + logits = self._model.joint(enc, pred)[:, 0, 0, :] + if not log_normalize: + return logits + + probs = F.log_softmax(logits, dim=len(logits.shape) - 1) + + return probs + + def _get_last_symb(self, labels: List[int]) -> int: + return self._SOS if len(labels) == 0 else labels[-1] diff --git a/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/helpers.py b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/helpers.py new file mode 100644 index 00000000000..cfe3b66f3c8 --- /dev/null +++ b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/helpers.py @@ -0,0 +1,123 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2019, Myrtle Software Limited. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from enum import Enum +from metrics import word_error_rate + + +class Optimization(Enum): + """Various levels of Optimization. + WARNING: This might have effect on model accuracy.""" + nothing = 0 + mxprO0 = 1 + mxprO1 = 2 + mxprO2 = 3 + mxprO3 = 4 + + +AmpOptimizations = {Optimization.mxprO0: "O0", + Optimization.mxprO1: "O1", + Optimization.mxprO2: "O2", + Optimization.mxprO3: "O3"} + + +def add_blank_label(labels): + if not isinstance(labels, list): + raise ValueError("labels must be a list of symbols") + labels.append("") + return labels + + +def __rnnt_decoder_predictions_tensor(tensor, labels): + """ + Takes output of greedy rnnt decoder and converts to strings. + Args: + tensor: model output tensor + label: A list of labels + Returns: + prediction + """ + hypotheses = [] + labels_map = dict([(i, labels[i]) for i in range(len(labels))]) + # iterate over batch + for ind in range(len(tensor)): + hypothesis = ''.join([labels_map[c] for c in tensor[ind]]) + hypotheses.append(hypothesis) + return hypotheses + + +def __gather_predictions(predictions_list: list, labels: list) -> list: + results = [] + for prediction in predictions_list: + results += __rnnt_decoder_predictions_tensor(prediction, labels=labels) + return results + + +def __gather_transcripts(transcript_list: list, transcript_len_list: list, + labels: list) -> list: + results = [] + labels_map = dict([(i, labels[i]) for i in range(len(labels))]) + for i, t in enumerate(transcript_list): + target = t.numpy().tolist() + reference = ''.join([labels_map[c] for c in target]) + results.append(reference) + return results + + +def process_evaluation_batch(tensors: dict, global_vars: dict, labels: list): + """ + Processes results of an iteration and saves it in global_vars + Args: + tensors: dictionary with results of an evaluation iteration, e.g. 
loss, predictions, transcript, and output + global_vars: dictionary where processes results of iteration are saved + labels: A list of labels + """ + for kv, v in tensors.items(): + if kv.startswith('predictions'): + global_vars['predictions'] += __gather_predictions( + v, labels=labels) + elif kv.startswith('transcript_length'): + transcript_len_list = v + elif kv.startswith('transcript'): + transcript_list = v + + global_vars['transcripts'] += __gather_transcripts(transcript_list, + transcript_len_list, + labels=labels) + + +def process_evaluation_epoch(global_vars: dict, tag=None): + """ + Processes results from each worker at the end of evaluation and combine to final result + Args: + global_vars: dictionary containing information of entire evaluation + Return: + wer: final word error rate + loss: final loss + """ + hypotheses = global_vars['predictions'] + references = global_vars['transcripts'] + + wer, scores, num_words = word_error_rate( + hypotheses=hypotheses, references=references) + return wer + + +def print_dict(d): + maxLen = max([len(ii) for ii in d.keys()]) + fmtString = '\t%' + str(maxLen) + 's : %s' + print('Arguments:') + for keyPair in sorted(d.items()): + print(fmtString % keyPair) diff --git a/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/metrics.py b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/metrics.py new file mode 100644 index 00000000000..5426e37237a --- /dev/null +++ b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/metrics.py @@ -0,0 +1,67 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List + + +def __levenshtein(a: List, b: List) -> int: + """Calculates the Levenshtein distance between a and b. + """ + n, m = len(a), len(b) + if n > m: + # Make sure n <= m, to use O(min(n,m)) space + a, b = b, a + n, m = m, n + + current = list(range(n + 1)) + for i in range(1, m + 1): + previous, current = current, [i] + [0] * n + for j in range(1, n + 1): + add, delete = previous[j] + 1, current[j - 1] + 1 + change = previous[j - 1] + if a[j - 1] != b[i - 1]: + change = change + 1 + current[j] = min(add, delete, change) + + return current[n] + + +def word_error_rate(hypotheses: List[str], references: List[str]) -> float: + """ + Computes Average Word Error rate between two texts represented as + corresponding lists of string. Hypotheses and references must have same length. + + Args: + hypotheses: list of hypotheses + references: list of references + + Returns: + (float) average word error rate + """ + scores = 0 + words = 0 + if len(hypotheses) != len(references): + raise ValueError("In word error rate calculation, hypotheses and reference" + " lists must have the same number of elements. 
But I got:" + "{0} and {1} correspondingly".format(len(hypotheses), len(references))) + for h, r in zip(hypotheses, references): + h_list = h.split() + r_list = r.split() + words += len(r_list) + scores += __levenshtein(h_list, r_list) + if words != 0: + wer = (1.0 * scores) / words + else: + wer = float('inf') + return wer, scores, words diff --git a/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/model_separable_rnnt.py b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/model_separable_rnnt.py new file mode 100644 index 00000000000..68a0ed6b5e5 --- /dev/null +++ b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/model_separable_rnnt.py @@ -0,0 +1,214 @@ +from typing import Optional, Tuple + +import numpy as np +import torch + +from rnn import rnn +from rnn import StackTime + + +class RNNT(torch.nn.Module): + def __init__(self, rnnt=None, num_classes=1, **kwargs): + super().__init__() + if kwargs.get("no_featurizer", False): + in_features = kwargs.get("in_features") + else: + feat_config = kwargs.get("feature_config") + # This may be useful in the future, for MLPerf + # configuration. + in_features = feat_config['features'] * \ + feat_config.get("frame_splicing", 1) + + self.encoder = Encoder(in_features, + rnnt["encoder_n_hidden"], + rnnt["encoder_pre_rnn_layers"], + rnnt["encoder_post_rnn_layers"], + rnnt["forget_gate_bias"], + None if "norm" not in rnnt else rnnt["norm"], + rnnt["rnn_type"], + rnnt["encoder_stack_time_factor"], + rnnt["dropout"], + ) + + self.prediction = Prediction( + num_classes, + rnnt["pred_n_hidden"], + rnnt["pred_rnn_layers"], + rnnt["forget_gate_bias"], + None if "norm" not in rnnt else rnnt["norm"], + rnnt["rnn_type"], + rnnt["dropout"], + ) + + self.joint = Joint( + num_classes, + rnnt["pred_n_hidden"], + rnnt["encoder_n_hidden"], + rnnt["joint_n_hidden"], + rnnt["dropout"], + ) + + def forward(self, x_padded: torch.Tensor, x_lens: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + return self.encoder(x_padded, x_lens) + + +class Encoder(torch.nn.Module): + def __init__(self, in_features, encoder_n_hidden, + encoder_pre_rnn_layers, encoder_post_rnn_layers, + forget_gate_bias, norm, rnn_type, encoder_stack_time_factor, + dropout): + super().__init__() + self.pre_rnn = rnn( + rnn=rnn_type, + input_size=in_features, + hidden_size=encoder_n_hidden, + num_layers=encoder_pre_rnn_layers, + norm=norm, + forget_gate_bias=forget_gate_bias, + dropout=dropout, + ) + self.stack_time = StackTime(factor=encoder_stack_time_factor) + self.post_rnn = rnn( + rnn=rnn_type, + input_size=encoder_stack_time_factor * encoder_n_hidden, + hidden_size=encoder_n_hidden, + num_layers=encoder_post_rnn_layers, + norm=norm, + forget_gate_bias=forget_gate_bias, + norm_first_rnn=True, + dropout=dropout, + ) + + def forward(self, x_padded: torch.Tensor, x_lens: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + x_padded, _ = self.pre_rnn(x_padded, None) + x_padded, x_lens = self.stack_time(x_padded, x_lens) + # (T, B, H) + x_padded, _ = self.post_rnn(x_padded, None) + # (B, T, H) + x_padded = x_padded.transpose(0, 1) + return x_padded, x_lens + +class Prediction(torch.nn.Module): + def __init__(self, vocab_size, n_hidden, pred_rnn_layers, + forget_gate_bias, norm, rnn_type, dropout): + super().__init__() + self.embed = torch.nn.Embedding(vocab_size - 1, n_hidden) + self.n_hidden = n_hidden + self.dec_rnn = rnn( + rnn=rnn_type, + input_size=n_hidden, + hidden_size=n_hidden, + num_layers=pred_rnn_layers, + 
norm=norm, + forget_gate_bias=forget_gate_bias, + dropout=dropout, + ) + + def forward(self, y: Optional[torch.Tensor], + state: Optional[Tuple[torch.Tensor, torch.Tensor]] = None) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + """ + B - batch size + U - label length + H - Hidden dimension size + L - Number of decoder layers = 2 + + Args: + y: (B, U) + + Returns: + Tuple (g, hid) where: + g: (B, U + 1, H) + hid: (h, c) where h is the final sequence hidden state and c is + the final cell state: + h (tensor), shape (L, B, H) + c (tensor), shape (L, B, H) + """ + if y is None: + # This is gross. I should really just pass in an SOS token + # instead. Is there no SOS token? + assert state is None + # Hacky, no way to determine this right now! + B = 1 + y = torch.zeros((B, 1, self.n_hidden), dtype=torch.float32) + else: + y = self.embed(y) + + # if state is None: + # batch = y.size(0) + # state = [ + # (torch.zeros(batch, self.pred_n_hidden, dtype=y.dtype, device=y.device), + # torch.zeros(batch, self.pred_n_hidden, dtype=y.dtype, device=y.device)) + # for _ in range(self.pred_rnn_layers) + # ] + + y = y.transpose(0, 1) # .contiguous() # (U + 1, B, H) + g, hid = self.dec_rnn(y, state) + g = g.transpose(0, 1) # .contiguous() # (B, U + 1, H) + # del y, state + return g, hid + +class Joint(torch.nn.Module): + def __init__(self, vocab_size, pred_n_hidden, enc_n_hidden, + joint_n_hidden, dropout): + super().__init__() + layers = [ + torch.nn.Linear(pred_n_hidden + enc_n_hidden, joint_n_hidden), + torch.nn.ReLU(), + ] + ([torch.nn.Dropout(p=dropout), ] if dropout else []) + [ + torch.nn.Linear(joint_n_hidden, vocab_size) + ] + self.net = torch.nn.Sequential( + *layers + ) + + def forward(self, f: torch.Tensor, g: torch.Tensor): + """ + f should be shape (B, T, H) + g should be shape (B, U + 1, H) + + returns: + logits of shape (B, T, U, K + 1) + """ + # Combine the input states and the output states + B, T, H = f.shape + B, U_, H2 = g.shape + + f = f.unsqueeze(dim=2) # (B, T, 1, H) + f = f.expand((B, T, U_, H)) + + g = g.unsqueeze(dim=1) # (B, 1, U + 1, H) + g = g.expand((B, T, U_, H2)) + + inp = torch.cat([f, g], dim=3) # (B, T, U, 2H) + res = self.net(inp) + # del f, g, inp + return res + +def label_collate(labels): + """Collates the label inputs for the rnn-t prediction network. + + If `labels` is already in torch.Tensor form this is a no-op. + + Args: + labels: A torch.Tensor List of label indexes or a torch.Tensor. + + Returns: + A padded torch.Tensor of shape (batch, max_seq_len). + """ + + if isinstance(labels, torch.Tensor): + return labels.type(torch.int64) + if not isinstance(labels, (list, tuple)): + raise ValueError( + f"`labels` should be a list or tensor not {type(labels)}" + ) + + batch_size = len(labels) + max_len = max(len(l) for l in labels) + + cat_labels = np.full((batch_size, max_len), fill_value=0.0, dtype=np.int32) + for e, l in enumerate(labels): + cat_labels[e, :len(l)] = l + labels = torch.LongTensor(cat_labels) + + return labels diff --git a/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/parts/features.py b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/parts/features.py new file mode 100644 index 00000000000..5a1309758eb --- /dev/null +++ b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/parts/features.py @@ -0,0 +1,260 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2019, Myrtle Software Limited. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Tuple + +import torch +import torch.nn as nn +import math +import librosa +from .segment import AudioSegment + + +class WaveformFeaturizer(object): + def __init__(self, input_cfg): + self.cfg = input_cfg + + def process(self, file_path, offset=0, duration=0, trim=False): + audio = AudioSegment.from_file(file_path, + target_sr=self.cfg['sample_rate'], + int_values=self.cfg.get( + 'int_values', False), + offset=offset, duration=duration, trim=trim) + return self.process_segment(audio) + + def process_segment(self, audio_segment): + return torch.tensor(audio_segment.samples, dtype=torch.float) + + @classmethod + def from_config(cls, input_config, perturbation_configs=None): + return cls(input_config) + + +constant = 1e-5 + + +def normalize_batch(x, seq_len, normalize_type): + if normalize_type == "per_feature": + x_mean = torch.zeros((seq_len.shape[0], x.shape[1]), dtype=x.dtype, + device=x.device) + x_std = torch.zeros((seq_len.shape[0], x.shape[1]), dtype=x.dtype, + device=x.device) + for i in range(x.shape[0]): + x_mean[i, :] = x[i, :, :seq_len[i]].mean(dim=1) + x_std[i, :] = x[i, :, :seq_len[i]].std(dim=1) + # make sure x_std is not zero + x_std += constant + return (x - x_mean.unsqueeze(2)) / x_std.unsqueeze(2) + elif normalize_type == "all_features": + x_mean = torch.zeros(seq_len.shape, dtype=x.dtype, device=x.device) + x_std = torch.zeros(seq_len.shape, dtype=x.dtype, device=x.device) + for i in range(x.shape[0]): + x_mean[i] = x[i, :, :seq_len[i].item()].mean() + x_std[i] = x[i, :, :seq_len[i].item()].std() + # make sure x_std is not zero + x_std += constant + return (x - x_mean.view(-1, 1, 1)) / x_std.view(-1, 1, 1) + else: + return x + + +def splice_frames(x, frame_splicing): + """ Stacks frames together across feature dim + + input is batch_size, feature_dim, num_frames + output is batch_size, feature_dim*frame_splicing, num_frames + + """ + seq = [x] + for n in range(1, frame_splicing): + tmp = torch.zeros_like(x) + tmp[:, :, :-n] = x[:, :, n:] + seq.append(tmp) + return torch.cat(seq, dim=1)[:, :, ::frame_splicing] + + +class FilterbankFeatures(nn.Module): + def __init__(self, sample_rate=8000, window_size=0.02, window_stride=0.01, + window="hamming", normalize="per_feature", n_fft=None, + preemph=0.97, + nfilt=64, lowfreq=0, highfreq=None, log=True, dither=constant, + pad_to=8, + max_duration=16.7, + frame_splicing=1): + super(FilterbankFeatures, self).__init__() +# print("PADDING: {}".format(pad_to)) + + torch_windows = { + 'hann': torch.hann_window, + 'hamming': torch.hamming_window, + 'blackman': torch.blackman_window, + 'bartlett': torch.bartlett_window, + 'none': None, + } + + self.win_length = int(sample_rate * window_size) # frame size + self.hop_length = int(sample_rate * window_stride) + self.n_fft = n_fft or 2 ** math.ceil(math.log2(self.win_length)) + + self.normalize = normalize + self.log = log + self.dither = dither + self.frame_splicing = frame_splicing + self.nfilt = nfilt + self.preemph = preemph 
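+        # Note: only pad_to == 0 is supported in this featurizer; forward() raises
+        # NotImplementedError for any other value.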
+ self.pad_to = pad_to + # For now, always enable this. + # See https://docs.google.com/presentation/d/1IVC3J-pHB-ipJpKsJox_SqmDHYdkIaoCXTbKmJmV2-I/edit?usp=sharing for elaboration + self.use_deterministic_dithering = True + highfreq = highfreq or sample_rate / 2 + window_fn = torch_windows.get(window, None) + window_tensor = window_fn(self.win_length, + periodic=False) if window_fn else None + filterbanks = torch.tensor( + librosa.filters.mel(sample_rate, self.n_fft, n_mels=nfilt, fmin=lowfreq, + fmax=highfreq), dtype=torch.float).unsqueeze(0) + # self.fb = filterbanks + # self.window = window_tensor + self.register_buffer("fb", filterbanks) + self.register_buffer("window", window_tensor) + # Calculate maximum sequence length (# frames) + max_length = 1 + math.ceil( + (max_duration * sample_rate - self.win_length) / self.hop_length + ) + max_pad = 16 - (max_length % 16) + self.max_length = max_length + max_pad + + def get_seq_len(self, seq_len): + seq_len = (seq_len + self.hop_length - 1) // self.hop_length + seq_len = (seq_len + self.frame_splicing - 1) // self.frame_splicing + return seq_len + + @torch.no_grad() + def forward(self, inp: Tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor: + x, seq_len = inp + + dtype = x.dtype + + seq_len = self.get_seq_len(seq_len) + + # dither + if self.dither > 0 and not self.use_deterministic_dithering: + x += self.dither * torch.randn_like(x) + + # do preemphasis + # Ideally, we would mask immediately after this... Ugh :( + if self.preemph is not None: + x = torch.cat((x[:, 0].unsqueeze(1), x[:, 1:] - self.preemph * x[:, :-1]), + dim=1) + + # do stft + x = torch.stft(x, n_fft=self.n_fft, hop_length=self.hop_length, + win_length=self.win_length, + center=True, window=self.window.to(dtype=torch.float), + return_complex=False + ) + # get power spectrum + x = x.pow(2).sum(-1) + + if self.dither > 0 and self.use_deterministic_dithering: + x = x + self.dither ** 2 + # dot with filterbank energies + x = torch.matmul(self.fb.to(x.dtype), x) + + # log features if required + if self.log: + x = torch.log(x + 1e-20) + + # frame splicing if required + if self.frame_splicing > 1: + seq = [x] + for n in range(1, self.frame_splicing): + tmp = torch.zeros_like(x) + tmp[:, :, :-n] = x[:, :, n:] + seq.append(tmp) + x = torch.cat(seq, dim=1)[:, :, ::self.frame_splicing] + + # normalize if required + constant = 1e-5 + if self.normalize == "per_feature": + x_mean = torch.zeros((seq_len.shape[0], x.shape[1]), dtype=x.dtype, + device=x.device) + x_std = torch.zeros((seq_len.shape[0], x.shape[1]), dtype=x.dtype, + device=x.device) + for i in range(x.shape[0]): + x_mean[i, :] = x[i, :, :seq_len[i]].mean(dim=1) + x_std[i, :] = x[i, :, :seq_len[i]].std(dim=1) + # make sure x_std is not zero + x_std += constant + x = (x - x_mean.unsqueeze(2)) / x_std.unsqueeze(2) + elif self.normalize == "all_features": + x_mean = torch.zeros(seq_len.shape, dtype=x.dtype, device=x.device) + x_std = torch.zeros(seq_len.shape, dtype=x.dtype, device=x.device) + for i in range(x.shape[0]): + x_mean[i] = x[i, :, :seq_len[i].item()].mean() + x_std[i] = x[i, :, :seq_len[i].item()].std() + # make sure x_std is not zero + x_std += constant + x = (x - x_mean.view(-1, 1, 1)) / x_std.view(-1, 1, 1) + else: + x = x + + # Hmmm... They don't do any masking anymore. Seems concerning! 
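+        # What actually happens below: the features are truncated to the longest valid
+        # length in the batch (seq_len.max()); the per-sample mask is commented out and
+        # any non-zero pad_to raises NotImplementedError.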
+ + # mask to zero any values beyond seq_len in batch, pad to multiple of `pad_to` (for efficiency) + # max_len = x.size(-1) + x = x[:, :, :seq_len.max()] # rnnt loss requires lengths to match + # mask = torch.arange(max_len).to(seq_len.dtype).to(x.device).expand(x.size(0), + # max_len) >= seq_len.unsqueeze(1) + + # x = x.masked_fill(mask.unsqueeze(1).to(device=x.device), 0) + pad_to = self.pad_to + if pad_to != 0: + raise NotImplementedError() + # if pad_to == "max": + # x = nn.functional.pad(x, (0, self.max_length - x.size(-1))) + # elif pad_to > 0: + # pad_amt = x.size(-1) % pad_to + # if pad_amt != 0: + # x = nn.functional.pad(x, (0, pad_to - pad_amt)) + + return x.to(dtype) + + @classmethod + def from_config(cls, cfg, log=False): + return cls(sample_rate=cfg['sample_rate'], window_size=cfg['window_size'], + window_stride=cfg['window_stride'], n_fft=cfg['n_fft'], + nfilt=cfg['features'], window=cfg['window'], + normalize=cfg['normalize'], + max_duration=cfg.get('max_duration', 16.7), + dither=cfg['dither'], pad_to=cfg.get("pad_to", 0), + frame_splicing=cfg.get("frame_splicing", 1), log=log) + + +class FeatureFactory(object): + featurizers = { + "logfbank": FilterbankFeatures, + "fbank": FilterbankFeatures, + } + + def __init__(self): + pass + + @classmethod + def from_config(cls, cfg): + feat_type = cfg.get('feat_type', "logspect") + featurizer = cls.featurizers[feat_type] + # return featurizer.from_config(cfg, log="log" in cfg['feat_type']) + return featurizer.from_config(cfg, log="log" in feat_type) diff --git a/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/parts/manifest.py b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/parts/manifest.py new file mode 100644 index 00000000000..fb04c5da882 --- /dev/null +++ b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/parts/manifest.py @@ -0,0 +1,176 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import string +import os + +from .text import _clean_text + + +def normalize_string(s, labels, table, **unused_kwargs): + """ + Normalizes string. For example: + 'call me at 8:00 pm!' -> 'call me at eight zero pm' + + Args: + s: string to normalize + labels: labels used during model training. 
+ + Returns: + Normalized string + """ + + def good_token(token, labels): + s = set(labels) + for t in token: + if t not in s: + return False + return True + + try: + text = _clean_text(s, ["english_cleaners"], table).strip() + return ''.join([t for t in text if good_token(t, labels=labels)]) + except: + print("WARNING: Normalizing {} failed".format(s)) + return None + + +class Manifest(object): + def __init__(self, data_dir, manifest_paths, labels, blank_index, max_duration=None, pad_to_max=False, + min_duration=None, sort_by_duration=False, max_utts=0, + normalize=True, speed_perturbation=False, filter_speed=1.0): + self.labels_map = dict([(labels[i], i) for i in range(len(labels))]) + self.blank_index = blank_index + self.max_duration = max_duration + ids = [] + duration = 0.0 + filtered_duration = 0.0 + + # If removing punctuation, make a list of punctuation to remove + table = None + if normalize: + # Punctuation to remove + punctuation = string.punctuation + punctuation = punctuation.replace("+", "") + punctuation = punctuation.replace("&", "") + # We might also want to consider: + # @ -> at + # -> number, pound, hashtag + # ~ -> tilde + # _ -> underscore + # % -> percent + # If a punctuation symbol is inside our vocab, we do not remove from text + for l in labels: + punctuation = punctuation.replace(l, "") + # Turn all punctuation to whitespace + table = str.maketrans(punctuation, " " * len(punctuation)) + for manifest_path in manifest_paths: + with open(manifest_path, "r", encoding="utf-8") as fh: + a = json.load(fh) + for data in a: + files_and_speeds = data['files'] + + if pad_to_max: + if not speed_perturbation: + min_speed = filter_speed + else: + min_speed = min(x['speed'] + for x in files_and_speeds) + max_duration = self.max_duration * min_speed + + data['duration'] = data['original_duration'] + if min_duration is not None and data['duration'] < min_duration: + filtered_duration += data['duration'] + continue + if max_duration is not None and data['duration'] > max_duration: + filtered_duration += data['duration'] + continue + + # Prune and normalize according to transcript + transcript_text = data[ + 'transcript'] if "transcript" in data else self.load_transcript( + data['text_filepath']) + if normalize: + transcript_text = normalize_string(transcript_text, labels=labels, + table=table) + if not isinstance(transcript_text, str): + print( + "WARNING: Got transcript: {}. It is not a string. 
Dropping data point".format( + transcript_text)) + filtered_duration += data['duration'] + continue + data["transcript"] = self.parse_transcript( + transcript_text) # convert to vocab indices + + if speed_perturbation: + audio_paths = [x['fname'] for x in files_and_speeds] + data['audio_duration'] = [x['duration'] + for x in files_and_speeds] + else: + audio_paths = [ + x['fname'] for x in files_and_speeds if x['speed'] == filter_speed] + data['audio_duration'] = [x['duration'] + for x in files_and_speeds if x['speed'] == filter_speed] + data['audio_filepath'] = [os.path.join( + data_dir, x) for x in audio_paths] + data.pop('files') + data.pop('original_duration') + + ids.append(data) + duration += data['duration'] + + if max_utts > 0 and len(ids) >= max_utts: + print( + 'Stopping parsing %s as max_utts=%d' % (manifest_path, max_utts)) + break + + if sort_by_duration: + ids = sorted(ids, key=lambda x: x['duration']) + self._data = ids + self._size = len(ids) + self._duration = duration + self._filtered_duration = filtered_duration + + def load_transcript(self, transcript_path): + with open(transcript_path, 'r', encoding="utf-8") as transcript_file: + transcript = transcript_file.read().replace('\n', '') + return transcript + + def parse_transcript(self, transcript): + chars = [self.labels_map.get(x, self.blank_index) + for x in list(transcript)] + transcript = list(filter(lambda x: x != self.blank_index, chars)) + return transcript + + def __getitem__(self, item): + return self._data[item] + + def __len__(self): + return self._size + + def __iter__(self): + return iter(self._data) + + @property + def duration(self): + return self._duration + + @property + def filtered_duration(self): + return self._filtered_duration + + @property + def data(self): + return list(self._data) diff --git a/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/parts/segment.py b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/parts/segment.py new file mode 100644 index 00000000000..08aa5c6a492 --- /dev/null +++ b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/parts/segment.py @@ -0,0 +1,170 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import librosa +import soundfile as sf + + +class AudioSegment(object): + """Monaural audio segment abstraction. + :param samples: Audio samples [num_samples x num_channels]. + :type samples: ndarray.float32 + :param sample_rate: Audio sample rate. + :type sample_rate: int + :raises TypeError: If the sample data type is not float or int. + """ + + def __init__(self, samples, sample_rate, target_sr=None, trim=False, + trim_db=60): + """Create audio segment from samples. + Samples are convert float32 internally, with int scaled to [-1, 1]. 
+ """ + samples = self._convert_samples_to_float32(samples) + if target_sr is not None and target_sr != sample_rate: + samples = librosa.core.resample(samples, sample_rate, target_sr) + sample_rate = target_sr + if trim: + samples, _ = librosa.effects.trim(samples, trim_db) + self._samples = samples + self._sample_rate = sample_rate + if self._samples.ndim >= 2: + self._samples = np.mean(self._samples, 1) + + def __eq__(self, other): + """Return whether two objects are equal.""" + if type(other) is not type(self): + return False + if self._sample_rate != other._sample_rate: + return False + if self._samples.shape != other._samples.shape: + return False + if np.any(self.samples != other._samples): + return False + return True + + def __ne__(self, other): + """Return whether two objects are unequal.""" + return not self.__eq__(other) + + def __str__(self): + """Return human-readable representation of segment.""" + return ("%s: num_samples=%d, sample_rate=%d, duration=%.2fsec, " + "rms=%.2fdB" % (type(self), self.num_samples, self.sample_rate, + self.duration, self.rms_db)) + + @staticmethod + def _convert_samples_to_float32(samples): + """Convert sample type to float32. + Audio sample type is usually integer or float-point. + Integers will be scaled to [-1, 1] in float32. + """ + float32_samples = samples.astype('float32') + if samples.dtype in np.sctypes['int']: + bits = np.iinfo(samples.dtype).bits + float32_samples *= (1. / 2 ** (bits - 1)) + elif samples.dtype in np.sctypes['float']: + pass + else: + raise TypeError("Unsupported sample type: %s." % samples.dtype) + return float32_samples + + @classmethod + def from_file(cls, filename, target_sr=None, int_values=False, offset=0, + duration=0, trim=False): + """ + Load a file supported by librosa and return as an AudioSegment. + :param filename: path of file to load + :param target_sr: the desired sample rate + :param int_values: if true, load samples as 32-bit integers + :param offset: offset in seconds when loading audio + :param duration: duration in seconds when loading audio + :return: numpy array of samples + """ + with sf.SoundFile(filename, 'r') as f: + dtype = 'int32' if int_values else 'float32' + sample_rate = f.samplerate + if offset > 0: + f.seek(int(offset * sample_rate)) + if duration > 0: + samples = f.read(int(duration * sample_rate), dtype=dtype) + else: + samples = f.read(dtype=dtype) + samples = samples.transpose() + return cls(samples, sample_rate, target_sr=target_sr, trim=trim) + + @property + def samples(self): + return self._samples.copy() + + @property + def sample_rate(self): + return self._sample_rate + + @property + def num_samples(self): + return self._samples.shape[0] + + @property + def duration(self): + return self._samples.shape[0] / float(self._sample_rate) + + @property + def rms_db(self): + mean_square = np.mean(self._samples ** 2) + return 10 * np.log10(mean_square) + + def gain_db(self, gain): + self._samples *= 10. ** (gain / 20.) + + def pad(self, pad_size, symmetric=False): + """Add zero padding to the sample. The pad size is given in number of samples. + If symmetric=True, `pad_size` will be added to both sides. If false, `pad_size` + zeros will be added only to the end. + """ + self._samples = np.pad(self._samples, + (pad_size if symmetric else 0, pad_size), + mode='constant') + + def subsegment(self, start_time=None, end_time=None): + """Cut the AudioSegment between given boundaries. + Note that this is an in-place transformation. + :param start_time: Beginning of subsegment in seconds. 
+ :type start_time: float + :param end_time: End of subsegment in seconds. + :type end_time: float + :raise ValueError: If start_time or end_time is incorrectly set, e.g. out + of bounds in time. + """ + start_time = 0.0 if start_time is None else start_time + end_time = self.duration if end_time is None else end_time + if start_time < 0.0: + start_time = self.duration + start_time + if end_time < 0.0: + end_time = self.duration + end_time + if start_time < 0.0: + raise ValueError("The slice start position (%f s) is out of " + "bounds." % start_time) + if end_time < 0.0: + raise ValueError("The slice end position (%f s) is out of bounds." % + end_time) + if start_time > end_time: + raise ValueError("The slice start position (%f s) is later than " + "the end position (%f s)." % (start_time, end_time)) + if end_time > self.duration: + raise ValueError("The slice end position (%f s) is out of bounds " + "(> %f s)" % (end_time, self.duration)) + start_sample = int(round(start_time * self._sample_rate)) + end_sample = int(round(end_time * self._sample_rate)) + self._samples = self._samples[start_sample:end_sample] diff --git a/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/parts/text/LICENSE b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/parts/text/LICENSE new file mode 100644 index 00000000000..4ad4ed1d5e3 --- /dev/null +++ b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/parts/text/LICENSE @@ -0,0 +1,19 @@ +Copyright (c) 2017 Keith Ito + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/parts/text/__init__.py b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/parts/text/__init__.py new file mode 100644 index 00000000000..61936879a95 --- /dev/null +++ b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/parts/text/__init__.py @@ -0,0 +1,12 @@ +# Copyright (c) 2017 Keith Ito +""" from https://github.com/keithito/tacotron """ +from . 
import cleaners + + +def _clean_text(text, cleaner_names, *args): + for name in cleaner_names: + cleaner = getattr(cleaners, name) + if not cleaner: + raise Exception('Unknown cleaner: %s' % name) + text = cleaner(text, *args) + return text diff --git a/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/parts/text/cleaners.py b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/parts/text/cleaners.py new file mode 100644 index 00000000000..e1e52af5f37 --- /dev/null +++ b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/parts/text/cleaners.py @@ -0,0 +1,116 @@ +# Copyright (c) 2017 Keith Ito +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" from https://github.com/keithito/tacotron +Modified to add puncturation removal +""" + +''' +Cleaners are transformations that run over the input text at both training and eval time. + +Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" +hyperparameter. Some cleaners are English-specific. You'll typically want to use: + 1. "english_cleaners" for English text + 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using + the Unidecode library (https://pypi.python.org/pypi/Unidecode) + 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update + the symbols in symbols.py to match your data). + +''' + + +# Regular expression matching whitespace: +import re +from text_unidecode import unidecode +from .numbers import normalize_numbers +_whitespace_re = re.compile(r'\s+') + +# List of (regular expression, replacement) pairs for abbreviations: +_abbreviations = [(re.compile('\\b%s\\.' 
% x[0], re.IGNORECASE), x[1]) for x in [ + ('mrs', 'misess'), + ('mr', 'mister'), + ('dr', 'doctor'), + ('st', 'saint'), + ('co', 'company'), + ('jr', 'junior'), + ('maj', 'major'), + ('gen', 'general'), + ('drs', 'doctors'), + ('rev', 'reverend'), + ('lt', 'lieutenant'), + ('hon', 'honorable'), + ('sgt', 'sergeant'), + ('capt', 'captain'), + ('esq', 'esquire'), + ('ltd', 'limited'), + ('col', 'colonel'), + ('ft', 'fort'), +]] + + +def expand_abbreviations(text): + for regex, replacement in _abbreviations: + text = re.sub(regex, replacement, text) + return text + + +def expand_numbers(text): + return normalize_numbers(text) + + +def lowercase(text): + return text.lower() + + +def collapse_whitespace(text): + return re.sub(_whitespace_re, ' ', text) + + +def convert_to_ascii(text): + return unidecode(text) + + +def remove_punctuation(text, table): + text = text.translate(table) + text = re.sub(r'&', " and ", text) + text = re.sub(r'\+', " plus ", text) + return text + + +def basic_cleaners(text): + '''Basic pipeline that lowercases and collapses whitespace without transliteration.''' + text = lowercase(text) + text = collapse_whitespace(text) + return text + + +def transliteration_cleaners(text): + '''Pipeline for non-English text that transliterates to ASCII.''' + text = convert_to_ascii(text) + text = lowercase(text) + text = collapse_whitespace(text) + return text + + +def english_cleaners(text, table=None): + '''Pipeline for English text, including number and abbreviation expansion.''' + text = convert_to_ascii(text) + text = lowercase(text) + text = expand_numbers(text) + text = expand_abbreviations(text) + if table is not None: + text = remove_punctuation(text, table) + text = collapse_whitespace(text) + return text diff --git a/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/parts/text/numbers.py b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/parts/text/numbers.py new file mode 100644 index 00000000000..d4b2f0d749f --- /dev/null +++ b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/parts/text/numbers.py @@ -0,0 +1,101 @@ +# Copyright (c) 2017 Keith Ito +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
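+# Illustrative examples (added here for clarity; exact wording depends on the inflect library):
+#   normalize_numbers("$3.50")  -> "three dollars, fifty cents"
+#   normalize_numbers("the 2nd") -> "the second"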
+""" from https://github.com/keithito/tacotron +Modified to add support for time and slight tweaks to _expand_number +""" + +import inflect +import re + + +_inflect = inflect.engine() +_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') +_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') +_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') +_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') +_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') +_number_re = re.compile(r'[0-9]+') +_time_re = re.compile(r'([0-9]{1,2}):([0-9]{2})') + + +def _remove_commas(m): + return m.group(1).replace(',', '') + + +def _expand_decimal_point(m): + return m.group(1).replace('.', ' point ') + + +def _expand_dollars(m): + match = m.group(1) + parts = match.split('.') + if len(parts) > 2: + return match + ' dollars' # Unexpected format + dollars = int(parts[0]) if parts[0] else 0 + cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 + if dollars and cents: + dollar_unit = 'dollar' if dollars == 1 else 'dollars' + cent_unit = 'cent' if cents == 1 else 'cents' + return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) + elif dollars: + dollar_unit = 'dollar' if dollars == 1 else 'dollars' + return '%s %s' % (dollars, dollar_unit) + elif cents: + cent_unit = 'cent' if cents == 1 else 'cents' + return '%s %s' % (cents, cent_unit) + else: + return 'zero dollars' + + +def _expand_ordinal(m): + return _inflect.number_to_words(m.group(0)) + + +def _expand_number(m): + if int(m.group(0)[0]) == 0: + return _inflect.number_to_words(m.group(0), andword='', group=1) + num = int(m.group(0)) + if num > 1000 and num < 3000: + if num == 2000: + return 'two thousand' + elif num > 2000 and num < 2010: + return 'two thousand ' + _inflect.number_to_words(num % 100) + elif num % 100 == 0: + return _inflect.number_to_words(num // 100) + ' hundred' + else: + return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') + # Add check for number phones and other large numbers + elif num > 1000000000 and num % 10000 != 0: + return _inflect.number_to_words(num, andword='', group=1) + else: + return _inflect.number_to_words(num, andword='') + + +def _expand_time(m): + mins = int(m.group(2)) + if mins == 0: + return _inflect.number_to_words(m.group(1)) + return " ".join([_inflect.number_to_words(m.group(1)), _inflect.number_to_words(m.group(2))]) + + +def normalize_numbers(text): + text = re.sub(_comma_number_re, _remove_commas, text) + text = re.sub(_pounds_re, r'\1 pounds', text) + text = re.sub(_dollars_re, _expand_dollars, text) + text = re.sub(_decimal_number_re, _expand_decimal_point, text) + text = re.sub(_ordinal_re, _expand_ordinal, text) + text = re.sub(_number_re, _expand_number, text) + text = re.sub(_time_re, _expand_time, text) + return text diff --git a/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/preprocessing.py b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/preprocessing.py new file mode 100644 index 00000000000..581885466b0 --- /dev/null +++ b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/preprocessing.py @@ -0,0 +1,39 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Tuple + +import torch +import torch.nn as nn + +from helpers import Optimization +from parts.features import FeatureFactory + + +class AudioPreprocessing(nn.Module): + """GPU accelerated audio preprocessing + """ + + def __init__(self, **kwargs): + nn.Module.__init__(self) # For PyTorch API + self.optim_level = kwargs.get( + 'optimization_level', Optimization.nothing) + self.featurizer = FeatureFactory.from_config(kwargs) + + def forward(self, x: Tuple[torch.Tensor, torch.Tensor]) -> Tuple[torch.Tensor, torch.Tensor]: + input_signal, length = x + length.requires_grad_(False) + processed_signal = self.featurizer(x) + processed_length = self.featurizer.get_seq_len(length) + return processed_signal, processed_length diff --git a/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/rnn.py b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/rnn.py new file mode 100644 index 00000000000..ba7cb19c912 --- /dev/null +++ b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/rnn.py @@ -0,0 +1,110 @@ +# Copyright (c) 2019, Myrtle Software Limited. All rights reserved. +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch + +from typing import Optional, Tuple + + +def rnn(rnn, input_size, hidden_size, num_layers, norm=None, + forget_gate_bias=1.0, dropout=0.0, **kwargs): + """TODO""" + if rnn != "lstm": + raise ValueError(f"Unknown rnn={rnn}") + if norm not in [None]: + raise ValueError(f"unknown norm={norm}") + + if rnn == "lstm": + return LstmDrop( + input_size=input_size, + hidden_size=hidden_size, + num_layers=num_layers, + dropout=dropout, + forget_gate_bias=forget_gate_bias, + **kwargs + ) + + +class LstmDrop(torch.nn.Module): + + def __init__(self, input_size, hidden_size, num_layers, dropout, forget_gate_bias, + **kwargs): + """Returns an LSTM with forget gate bias init to `forget_gate_bias`. + + Args: + input_size: See `torch.nn.LSTM`. + hidden_size: See `torch.nn.LSTM`. + num_layers: See `torch.nn.LSTM`. + dropout: See `torch.nn.LSTM`. + forget_gate_bias: For each layer and each direction, the total value of + to initialise the forget gate bias to. + + Returns: + A `torch.nn.LSTM`. 
+ """ + super(LstmDrop, self).__init__() + + self.lstm = torch.nn.LSTM( + input_size=input_size, + hidden_size=hidden_size, + num_layers=num_layers, + dropout=dropout, + ) + if forget_gate_bias is not None: + for name, v in self.lstm.named_parameters(): + if "bias_ih" in name: + bias = getattr(self.lstm, name) + bias.data[hidden_size:2 * hidden_size].fill_(forget_gate_bias) + if "bias_hh" in name: + bias = getattr(self.lstm, name) + bias.data[hidden_size:2 * hidden_size].fill_(0) + + if dropout: + self.inplace_dropout = torch.nn.Dropout(dropout, inplace=True) + else: + self.inplace_droput = None + + def forward(self, x: torch.Tensor, + h: Optional[Tuple[torch.Tensor, torch.Tensor]] = None): + if hasattr(self.lstm, "module"): + x, h = self.lstm.module(x, h) + else: + x, h = self.lstm(x, h) + + if self.inplace_dropout is not None: + self.inplace_dropout(x.data) + + return x, h + + +class StackTime(torch.nn.Module): + + __constants__ = ["factor"] + + def __init__(self, factor): + super().__init__() + self.factor = int(factor) + + def forward(self, x, x_lens): + # T, B, U + seq = [x] + for i in range(1, self.factor): + # This doesn't seem to make much sense... + tmp = torch.zeros_like(x) + tmp[:-i, :, :] = x[i:, :, :] + seq.append(tmp) + x_lens = torch.ceil(x_lens.float() / self.factor).int() + # Gross, this is horrible. What a waste of memory... + return torch.cat(seq, dim=2)[::self.factor, :, :], x_lens diff --git a/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/scripts/docker/build.sh b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/scripts/docker/build.sh new file mode 100644 index 00000000000..cfdc97c010e --- /dev/null +++ b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/scripts/docker/build.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +docker build . --rm -t jasper \ No newline at end of file diff --git a/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/scripts/docker/launch.sh b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/scripts/docker/launch.sh new file mode 100644 index 00000000000..5c9c6a3f346 --- /dev/null +++ b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/scripts/docker/launch.sh @@ -0,0 +1,32 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +#!/bin/bash + +DATA_DIR=$1 +CHECKPOINT_DIR=$2 +RESULT_DIR=$3 + +docker run -it --rm \ + --gpus='"device=1"' \ + --shm-size=4g \ + --ulimit memlock=-1 \ + --ulimit stack=67108864 \ + -v "$DATA_DIR":/datasets \ + -v "$CHECKPOINT_DIR":/checkpoints/ \ + -v "$RESULT_DIR":/results/ \ + -v $PWD:/code \ + -v $PWD:/workspace/jasper \ + mlperf-rnnt-ref bash diff --git a/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/scripts/download_librispeech.sh b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/scripts/download_librispeech.sh new file mode 100644 index 00000000000..ee322fe3043 --- /dev/null +++ b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/scripts/download_librispeech.sh @@ -0,0 +1,28 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +#!/usr/bin/env bash + +DATA_SET="LibriSpeech" +DATA_ROOT_DIR="/datasets" +DATA_DIR="${DATA_ROOT_DIR}/${DATA_SET}" +if [ ! -d "$DATA_DIR" ] +then + mkdir $DATA_DIR + chmod go+rx $DATA_DIR + python utils/download_librispeech.py utils/librispeech.csv $DATA_DIR -e ${DATA_ROOT_DIR}/ +else + echo "Directory $DATA_DIR already exists." +fi diff --git a/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/scripts/evaluation.sh b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/scripts/evaluation.sh new file mode 100644 index 00000000000..fcd472fd9aa --- /dev/null +++ b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/scripts/evaluation.sh @@ -0,0 +1,92 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
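+# Usage (positional, in order): DATA_DIR DATASET MODEL_CONFIG RESULT_DIR CHECKPOINT
+#   CREATE_LOGFILE CUDNN_BENCHMARK NUM_GPUS PRECISION NUM_STEPS SEED BATCH_SIZE
+# Runs inference.py on the chosen LibriSpeech split; every argument except CHECKPOINT
+# has a default.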
+ + +#!/bin/bash +echo "Container nvidia build = " $NVIDIA_BUILD_ID + +DATA_DIR=${1:-"/datasets/LibriSpeech"} +DATASET=${2:-"dev-clean"} +MODEL_CONFIG=${3:-"configs/jasper10x5dr_sp_offline_specaugment.toml"} +RESULT_DIR=${4:-"/results"} +CHECKPOINT=$5 +CREATE_LOGFILE=${6:-"true"} +CUDNN_BENCHMARK=${7:-"false"} +NUM_GPUS=${8:-1} +PRECISION=${9:-"fp32"} +NUM_STEPS=${10:-"-1"} +SEED=${11:-0} +BATCH_SIZE=${12:-64} + + +if [ "$CREATE_LOGFILE" = "true" ] ; then + export GBS=$(expr $BATCH_SIZE \* $NUM_GPUS) + printf -v TAG "jasper_evaluation_${DATASET}_%s_gbs%d" "$PRECISION" $GBS + DATESTAMP=`date +'%y%m%d%H%M%S'` + LOGFILE="${RESULT_DIR}/${TAG}.${DATESTAMP}.log" + printf "Logs written to %s\n" "$LOGFILE" +fi + + + +PREC="" +if [ "$PRECISION" = "fp16" ] ; then + PREC="--fp16" +elif [ "$PRECISION" = "fp32" ] ; then + PREC="" +else + echo "Unknown argument" + exit -2 +fi + +STEPS="" +if [ "$NUM_STEPS" -gt 0 ] ; then + STEPS=" --steps $NUM_STEPS" +fi + +if [ "$CUDNN_BENCHMARK" = "true" ] ; then + CUDNN_BENCHMARK=" --cudnn_benchmark" +else + CUDNN_BENCHMARK="" +fi + + +CMD=" inference.py " +CMD+=" --batch_size $BATCH_SIZE " +CMD+=" --dataset_dir $DATA_DIR " +CMD+=" --val_manifest $DATA_DIR/librispeech-${DATASET}-wav.json " +CMD+=" --model_toml $MODEL_CONFIG " +CMD+=" --seed $SEED " +CMD+=" --ckpt $CHECKPOINT " +CMD+=" $CUDNN_BENCHMARK" +CMD+=" $PREC " +CMD+=" $STEPS " + + +if [ "$NUM_GPUS" -gt 1 ] ; then + CMD="python3 -m torch.distributed.launch --nproc_per_node=$NUM_GPUS $CMD" +else + CMD="python3 $CMD" +fi + + +set -x +if [ -z "$LOGFILE" ] ; then + $CMD +else + ( + $CMD + ) |& tee "$LOGFILE" +fi +set +x diff --git a/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/scripts/inference.sh b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/scripts/inference.sh new file mode 100644 index 00000000000..2d4474ce2b7 --- /dev/null +++ b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/scripts/inference.sh @@ -0,0 +1,104 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
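+# Usage (positional, in order): DATA_DIR DATASET MODEL_CONFIG RESULT_DIR CHECKPOINT
+#   CREATE_LOGFILE CUDNN_BENCHMARK PRECISION NUM_STEPS SEED BATCH_SIZE
+#   MODELOUTPUT_FILE PREDICTION_FILE
+# Pass "none" for MODELOUTPUT_FILE or PREDICTION_FILE to skip saving logits or predictions.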
+ + +#!/bin/bash +echo "Container nvidia build = " $NVIDIA_BUILD_ID + + +DATA_DIR=${1-"/datasets/LibriSpeech"} +DATASET=${2:-"dev-clean"} +MODEL_CONFIG=${3:-"configs/jasper10x5dr_sp_offline_specaugment.toml"} +RESULT_DIR=${4:-"/results"} +CHECKPOINT=$5 +CREATE_LOGFILE=${6:-"true"} +CUDNN_BENCHMARK=${7:-"false"} +PRECISION=${8:-"fp32"} +NUM_STEPS=${9:-"-1"} +SEED=${10:-0} +BATCH_SIZE=${11:-64} +MODELOUTPUT_FILE=${12:-"none"} +PREDICTION_FILE=${13:-"$RESULT_DIR/${DATASET}.predictions"} + +if [ "$CREATE_LOGFILE" = "true" ] ; then + export GBS=$(expr $BATCH_SIZE) + printf -v TAG "jasper_inference_${DATASET}_%s_gbs%d" "$PRECISION" $GBS + DATESTAMP=`date +'%y%m%d%H%M%S'` + LOGFILE="${RESULT_DIR}/${TAG}.${DATESTAMP}.log" + printf "Logs written to %s\n" "$LOGFILE" +fi + + + +PREC="" +if [ "$PRECISION" = "fp16" ] ; then + PREC="--fp16" +elif [ "$PRECISION" = "fp32" ] ; then + PREC="" +else + echo "Unknown argument" + exit -2 +fi + +PRED="" +if [ "$PREDICTION_FILE" = "none" ] ; then + PRED="" +else + PRED=" --save_prediction $PREDICTION_FILE" +fi + +OUTPUT="" +if [ "$MODELOUTPUT_FILE" = "none" ] ; then + OUTPUT=" " +else + OUTPUT=" --logits_save_to $MODELOUTPUT_FILE" +fi + + +if [ "$CUDNN_BENCHMARK" = "true" ]; then + CUDNN_BENCHMARK=" --cudnn_benchmark" +else + CUDNN_BENCHMARK="" +fi + +STEPS="" +if [ "$NUM_STEPS" -gt 0 ] ; then + STEPS=" --steps $NUM_STEPS" +fi + +CMD=" python inference.py " +CMD+=" --batch_size $BATCH_SIZE " +CMD+=" --dataset_dir $DATA_DIR " +CMD+=" --val_manifest $DATA_DIR/librispeech-${DATASET}-wav.json " +CMD+=" --model_toml $MODEL_CONFIG " +CMD+=" --seed $SEED " +CMD+=" --ckpt $CHECKPOINT " +CMD+=" $CUDNN_BENCHMARK" +CMD+=" $PRED " +CMD+=" $OUTPUT " +CMD+=" $PREC " +CMD+=" $STEPS " + + +set -x +if [ -z "$LOGFILE" ] ; then + $CMD +else + ( + $CMD + ) |& tee "$LOGFILE" +fi +set +x +echo "MODELOUTPUT_FILE: ${MODELOUTPUT_FILE}" +echo "PREDICTION_FILE: ${PREDICTION_FILE}" diff --git a/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/scripts/inference_benchmark.sh b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/scripts/inference_benchmark.sh new file mode 100644 index 00000000000..7aeea84c159 --- /dev/null +++ b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/scripts/inference_benchmark.sh @@ -0,0 +1,84 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
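+# Usage (positional, in order): DATA_DIR DATASET MODEL_CONFIG RESULT_DIR CHECKPOINT
+#   CREATE_LOGFILE CUDNN_BENCHMARK PRECISION NUM_STEPS MAX_DURATION SEED BATCH_SIZE
+# Runs inference_benchmark.py and greps the resulting log for latency figures.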
+ + +#!/bin/bash + +echo "Container nvidia build = " $NVIDIA_BUILD_ID + + +DATA_DIR=${1:-"/datasets/LibriSpeech"} +DATASET=${2:-"dev-clean"} +MODEL_CONFIG=${3:-"configs/jasper10x5dr_sp_offline_specaugment.toml"} +RESULT_DIR=${4:-"/results"} +CHECKPOINT=$5 +CREATE_LOGFILE=${6:-"true"} +CUDNN_BENCHMARK=${7:-"true"} +PRECISION=${8:-"fp32"} +NUM_STEPS=${9:-"-1"} +MAX_DURATION=${10:-"36"} +SEED=${11:-0} +BATCH_SIZE=${12:-64} + +PREC="" +if [ "$PRECISION" = "fp16" ] ; then + PREC="--fp16" +elif [ "$PRECISION" = "fp32" ] ; then + PREC="" +else + echo "Unknown argument" + exit -2 +fi +STEPS="" +if [ "$NUM_STEPS" -gt 0 ] ; then + STEPS=" --steps $NUM_STEPS" +fi +if [ "$CUDNN_BENCHMARK" = "true" ] ; then + CUDNN_BENCHMARK=" --cudnn_benchmark" +else + CUDNN_BENCHMARK="" +fi + +CMD=" python inference_benchmark.py" +CMD+=" --batch_size=$BATCH_SIZE" +CMD+=" --model_toml=$MODEL_CONFIG" +CMD+=" --seed=$SEED" +CMD+=" --dataset_dir=$DATA_DIR" +CMD+=" --val_manifest $DATA_DIR/librispeech-${DATASET}-wav.json " +CMD+=" --ckpt=$CHECKPOINT" +CMD+=" --max_duration=$MAX_DURATION" +CMD+=" --pad_to=-1" +CMD+=" $CUDNN_BENCHMARK" +CMD+=" $PREC" +CMD+=" $STEPS" + + +if [ "$CREATE_LOGFILE" = "true" ] ; then + export GBS=$(expr $BATCH_SIZE ) + printf -v TAG "jasper_inference_benchmark_%s_gbs%d" "$PRECISION" $GBS + DATESTAMP=`date +'%y%m%d%H%M%S'` + LOGFILE="${RESULT_DIR}/${TAG}.${DATESTAMP}.log" + printf "Logs written to %s\n" "$LOGFILE" +fi + +set -x +if [ -z "$LOGFILE" ] ; then + $CMD +else + ( + $CMD + ) |& tee "$LOGFILE" + grep 'latency' "$LOGFILE" +fi +set +x diff --git a/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/scripts/preprocess_librispeech.sh b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/scripts/preprocess_librispeech.sh new file mode 100644 index 00000000000..7cfe5cc6a57 --- /dev/null +++ b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/scripts/preprocess_librispeech.sh @@ -0,0 +1,51 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
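+# Converts the extracted LibriSpeech FLAC data under /datasets/LibriSpeech to WAV and
+# writes one JSON manifest per split; the three training splits are additionally
+# speed-perturbed at 0.9x and 1.1x.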
+ +#!/usr/bin/env bash + +python ./utils/convert_librispeech.py \ + --input_dir /datasets/LibriSpeech/train-clean-100 \ + --dest_dir /datasets/LibriSpeech/train-clean-100-wav \ + --output_json /datasets/LibriSpeech/librispeech-train-clean-100-wav.json \ + --speed 0.9 1.1 +python ./utils/convert_librispeech.py \ + --input_dir /datasets/LibriSpeech/train-clean-360 \ + --dest_dir /datasets/LibriSpeech/train-clean-360-wav \ + --output_json /datasets/LibriSpeech/librispeech-train-clean-360-wav.json \ + --speed 0.9 1.1 +python ./utils/convert_librispeech.py \ + --input_dir /datasets/LibriSpeech/train-other-500 \ + --dest_dir /datasets/LibriSpeech/train-other-500-wav \ + --output_json /datasets/LibriSpeech/librispeech-train-other-500-wav.json \ + --speed 0.9 1.1 + + +python ./utils/convert_librispeech.py \ + --input_dir /datasets/LibriSpeech/dev-clean \ + --dest_dir /datasets/LibriSpeech/dev-clean-wav \ + --output_json /datasets/LibriSpeech/librispeech-dev-clean-wav.json +python ./utils/convert_librispeech.py \ + --input_dir /datasets/LibriSpeech/dev-other \ + --dest_dir /datasets/LibriSpeech/dev-other-wav \ + --output_json /datasets/LibriSpeech/librispeech-dev-other-wav.json + + +python ./utils/convert_librispeech.py \ + --input_dir /datasets/LibriSpeech/test-clean \ + --dest_dir /datasets/LibriSpeech/test-clean-wav \ + --output_json /datasets/LibriSpeech/librispeech-test-clean-wav.json +python ./utils/convert_librispeech.py \ + --input_dir /datasets/LibriSpeech/test-other \ + --dest_dir /datasets/LibriSpeech/test-other-wav \ + --output_json /datasets/LibriSpeech/librispeech-test-other-wav.json diff --git a/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/scripts/train.sh b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/scripts/train.sh new file mode 100644 index 00000000000..d59ce8ebeb2 --- /dev/null +++ b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/scripts/train.sh @@ -0,0 +1,113 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2019, Myrtle Software Limited. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
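+# Usage (positional, in order): DATA_DIR MODEL_CONFIG RESULT_DIR CHECKPOINT CREATE_LOGFILE
+#   CUDNN_BENCHMARK NUM_GPUS PRECISION EPOCHS SEED BATCH_SIZE LEARNING_RATE
+#   GRADIENT_ACCUMULATION_STEPS
+# Note: EVAL_BATCH_SIZE and LEARNING_RATE_WARMUP below reuse positional indexes ${11}
+# and ${12}, so they track BATCH_SIZE and LEARNING_RATE rather than taking their own
+# arguments.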
+ + +#!/bin/bash +echo "Container nvidia build = " $NVIDIA_BUILD_ID + +DATA_DIR=${1:-"/datasets/LibriSpeech"} +MODEL_CONFIG=${2:-"configs/rnnt.toml"} +RESULT_DIR=${3:-"/results"} +CHECKPOINT=${4:-"none"} +CREATE_LOGFILE=${5:-"true"} +CUDNN_BENCHMARK=${6:-"true"} +NUM_GPUS=${7:-8} +PRECISION=${8:-"fp16"} +EPOCHS=${9:-100} +SEED=${10:-6} +BATCH_SIZE=${11:-8} +EVAL_BATCH_SIZE=${11:-2} +LEARNING_RATE=${12:-"0.001"} +LEARNING_RATE_WARMUP=${12:-"8000"} +GRADIENT_ACCUMULATION_STEPS=${13:-1} +LAUNCH_OPT=${LAUNCH_OPT:-"none"} + + +PREC="" +if [ "$PRECISION" = "fp16" ] ; then + PREC="--fp16" +elif [ "$PRECISION" = "fp32" ] ; then + PREC="" +else + echo "Unknown argument" + exit -2 +fi + +CUDNN="" +if [ "$CUDNN_BENCHMARK" = "true" ] && [ "$PRECISION" = "fp16" ]; then + CUDNN=" --cudnn" +else + CUDNN="" +fi + + + +if [ "$CHECKPOINT" = "none" ] ; then + CHECKPOINT="" +else + CHECKPOINT=" --ckpt=${CHECKPOINT}" +fi + + +CMD=" train.py" +CMD+=" --batch_size=$BATCH_SIZE" +CMD+=" --eval_batch_size=$EVAL_BATCH_SIZE" +CMD+=" --num_epochs=$EPOCHS" +CMD+=" --output_dir=$RESULT_DIR" +CMD+=" --model_toml=$MODEL_CONFIG" +CMD+=" --lr=$LEARNING_RATE" +CMD+=" --lr_warmup=$LEARNING_RATE_WARMUP" +CMD+=" --seed=$SEED" +CMD+=" --optimizer=adam" +CMD+=" --dataset_dir=$DATA_DIR" +CMD+=" --val_manifest=$DATA_DIR/librispeech-dev-clean-wav.json" +CMD+=" --train_manifest=$DATA_DIR/librispeech-train-clean-100-wav.json,$DATA_DIR/librispeech-train-clean-360-wav.json,$DATA_DIR/librispeech-train-other-500-wav.json" +CMD+=" --weight_decay=1e-3" +CMD+=" --save_freq=100" +CMD+=" --eval_freq=1" +CMD+=" --train_freq=250" +CMD+=" --lr_decay" +CMD+=" --gradient_accumulation_steps=$GRADIENT_ACCUMULATION_STEPS " +CMD+=" $CHECKPOINT" +CMD+=" $PREC" +CMD+=" $CUDNN" + + +if [ "${LAUNCH_OPT}" != "none" ]; then + CMD="python -m $LAUNCH_OPT $CMD" +elif [ "$NUM_GPUS" -gt 1 ] ; then + CMD="python3 -m multiproc --nproc_per_node=$NUM_GPUS $CMD" +else + CMD="python3 $CMD" +fi + + +if [ "$CREATE_LOGFILE" = "true" ] ; then + export GBS=$(expr $BATCH_SIZE \* $NUM_GPUS) + printf -v TAG "rnnt_train_%s_gbs%d" "$PRECISION" $GBS + DATESTAMP=`date +'%y%m%d%H%M%S'` + LOGFILE=$RESULT_DIR/$TAG.$DATESTAMP.log + printf "Logs written to %s\n" "$LOGFILE" +fi + +set -x +if [ -z "$LOGFILE" ] ; then + $CMD +else + ( + $CMD + ) |& tee $LOGFILE +fi +set +x diff --git a/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/scripts/train_benchmark.sh b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/scripts/train_benchmark.sh new file mode 100644 index 00000000000..7b5a33705ca --- /dev/null +++ b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/scripts/train_benchmark.sh @@ -0,0 +1,130 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
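+# Usage (positional, in order): DATA_DIR MODEL_CONFIG RESULT_DIR CREATE_LOGFILE
+#   CUDNN_BENCHMARK NUM_GPUS PRECISION NUM_STEPS MAX_DURATION SEED BATCH_SIZE
+#   LEARNING_RATE GRADIENT_ACCUMULATION_STEPS PRINT_FREQUENCY
+# Runs a training benchmark and appends mean latency, throughput, WER and loss figures
+# parsed from the log.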
+ +#!/bin/bash + +echo "Container nvidia build = " $NVIDIA_BUILD_ID + +DATA_DIR=${1:-"/datasets/LibriSpeech"} +MODEL_CONFIG=${2:-"configs/jasper10x5dr_sp_offline_specaugment.toml"} +RESULT_DIR=${3:-"/results"} +CREATE_LOGFILE=${4:-"true"} +CUDNN_BENCHMARK=${5:-"true"} +NUM_GPUS=${6:-8} +PRECISION=${7:-"fp16"} +NUM_STEPS=${8:-"-1"} +MAX_DURATION=${9:-16.7} +SEED=${10:-0} +BATCH_SIZE=${11:-64} +LEARNING_RATE=${12:-"0.015"} +GRADIENT_ACCUMULATION_STEPS=${13:-1} +PRINT_FREQUENCY=${14:-1} + + +PREC="" +if [ "$PRECISION" = "fp16" ] ; then + PREC=" --fp16" +elif [ "$PRECISION" = "fp32" ] ; then + PREC="" +else + echo "Unknown argument" + exit -2 +fi + +STEPS="" +if [ "$NUM_STEPS" -ne "-1" ] ; then + STEPS=" --num_steps=$NUM_STEPS" +elif [ "$NUM_STEPS" = "-1" ] ; then + STEPS="" +else + echo "Unknown argument" + exit -2 +fi + +CUDNN="" +if [ "$CUDNN_BENCHMARK" = "true" ] ; then + CUDNN=" --cudnn" +else + CUDNN="" +fi + + +CMD=" train.py" +CMD+=" --batch_size=$BATCH_SIZE" +CMD+=" --num_epochs=400" +CMD+=" --output_dir=$RESULT_DIR" +CMD+=" --model_toml=$MODEL_CONFIG" +CMD+=" --lr=$LEARNING_RATE" +CMD+=" --seed=$SEED" +CMD+=" --optimizer=novograd" +CMD+=" --gradient_accumulation_steps=$GRADIENT_ACCUMULATION_STEPS" +CMD+=" --dataset_dir=$DATA_DIR" +CMD+=" --val_manifest=$DATA_DIR/librispeech-dev-clean-wav.json" +CMD+=" --train_manifest=$DATA_DIR/librispeech-train-clean-100-wav.json,$DATA_DIR/librispeech-train-clean-360-wav.json,$DATA_DIR/librispeech-train-other-500-wav.json" +CMD+=" --weight_decay=1e-3" +CMD+=" --save_freq=100000" +CMD+=" --eval_freq=100000" +CMD+=" --max_duration=$MAX_DURATION" +CMD+=" --pad_to_max" +CMD+=" --train_freq=$PRINT_FREQUENCY" +CMD+=" --lr_decay" +CMD+=" $CUDNN" +CMD+=" $PREC" +CMD+=" $STEPS" + +if [ "$NUM_GPUS" -gt 1 ] ; then + CMD="python3 -m torch.distributed.launch --nproc_per_node=$NUM_GPUS $CMD" +else + CMD="python3 $CMD" +fi + + +if [ "$CREATE_LOGFILE" = "true" ] ; then + export GBS=$(expr $BATCH_SIZE \* $NUM_GPUS) + printf -v TAG "jasper_train_benchmark_%s_gbs%d" "$PRECISION" $GBS + DATESTAMP=`date +'%y%m%d%H%M%S'` + LOGFILE="${RESULT_DIR}/${TAG}.${DATESTAMP}.log" + printf "Logs written to %s\n" "$LOGFILE" + +fi + +if [ -z "$LOGFILE" ] ; then + + set -x + $CMD + set +x +else + + set -x + ( + $CMD + ) |& tee "$LOGFILE" + + set +x + + mean_latency=`cat "$LOGFILE" | grep 'Step time' | awk '{print $3}' | tail -n +2 | egrep -o '[0-9.]+'| awk 'BEGIN {total=0} {total+=$1} END {printf("%.2f\n",total/NR)}'` + mean_throughput=`python -c "print($BATCH_SIZE*$NUM_GPUS/${mean_latency})"` + training_wer_per_pgu=`cat "$LOGFILE" | grep 'training_batch_WER'| awk '{print $2}' | tail -n 1 | egrep -o '[0-9.]+'` + training_loss_per_pgu=`cat "$LOGFILE" | grep 'Loss@Step'| awk '{print $4}' | tail -n 1 | egrep -o '[0-9.]+'` + final_eval_wer=`cat "$LOGFILE" | grep 'Evaluation WER'| tail -n 1 | egrep -o '[0-9.]+'` + final_eval_loss=`cat "$LOGFILE" | grep 'Evaluation Loss'| tail -n 1 | egrep -o '[0-9.]+'` + + echo "max duration: $MAX_DURATION s" | tee -a "$LOGFILE" + echo "mean_latency: $mean_latency s" | tee -a "$LOGFILE" + echo "mean_throughput: $mean_throughput sequences/s" | tee -a "$LOGFILE" + echo "training_wer_per_pgu: $training_wer_per_pgu" | tee -a "$LOGFILE" + echo "training_loss_per_pgu: $training_loss_per_pgu" | tee -a "$LOGFILE" + echo "final_eval_loss: $final_eval_loss" | tee -a "$LOGFILE" + echo "final_eval_wer: $final_eval_wer" | tee -a "$LOGFILE" +fi diff --git a/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/utils/__init__.py 
b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/utils/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/utils/convert_librispeech.py b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/utils/convert_librispeech.py new file mode 100644 index 00000000000..4b7c84a4fe2 --- /dev/null +++ b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/utils/convert_librispeech.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import argparse +import os +import glob +import multiprocessing +import json + +import pandas as pd + +from preprocessing_utils import parallel_preprocess + +parser = argparse.ArgumentParser(description='Preprocess LibriSpeech.') +parser.add_argument('--input_dir', type=str, required=True, + help='LibriSpeech collection input dir') +parser.add_argument('--dest_dir', type=str, required=True, + help='Output dir') +parser.add_argument('--output_json', type=str, default='./', + help='name of the output json file.') +parser.add_argument('-s', '--speed', type=float, nargs='*', + help='Speed perturbation ratio') +parser.add_argument('--target_sr', type=int, default=None, + help='Target sample rate. ' + 'defaults to the input sample rate') +parser.add_argument('--overwrite', action='store_true', + help='Overwrite file if exists') +parser.add_argument('--parallel', type=int, default=multiprocessing.cpu_count(), + help='Number of threads to use when processing audio files') +args = parser.parse_args() + +args.input_dir = args.input_dir.rstrip('/') +args.dest_dir = args.dest_dir.rstrip('/') + + +def build_input_arr(input_dir): + txt_files = glob.glob(os.path.join(input_dir, '**', '*.trans.txt'), + recursive=True) + input_data = [] + for txt_file in txt_files: + rel_path = os.path.relpath(txt_file, input_dir) + with open(txt_file) as fp: + for line in fp: + fname, _, transcript = line.partition(' ') + input_data.append(dict(input_relpath=os.path.dirname(rel_path), + input_fname=fname + '.flac', + transcript=transcript)) + return input_data + + +print("[%s] Scanning input dir..." % args.output_json) +dataset = build_input_arr(input_dir=args.input_dir) + +print("[%s] Converting audio files..." % args.output_json) +dataset = parallel_preprocess(dataset=dataset, + input_dir=args.input_dir, + dest_dir=args.dest_dir, + target_sr=args.target_sr, + speed=args.speed, + overwrite=args.overwrite, + parallel=args.parallel) + +print("[%s] Generating json..." % args.output_json) +df = pd.DataFrame(dataset, dtype=object) + +# Save json with python. 
df.to_json() produces backslashes in file paths
+dataset = df.to_dict(orient='records')
+with open(args.output_json, 'w') as fp:
+    json.dump(dataset, fp, indent=2)
diff --git a/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/utils/download_librispeech.py b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/utils/download_librispeech.py
new file mode 100644
index 00000000000..9cb26122159
--- /dev/null
+++ b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/utils/download_librispeech.py
@@ -0,0 +1,76 @@
+#!/usr/bin/env python
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import os
+import argparse
+import pandas as pd
+
+from download_utils import download_file, md5_checksum, extract
+
+parser = argparse.ArgumentParser(
+    description='Download, verify and extract dataset files')
+parser.add_argument('csv', type=str,
+                    help='CSV file with urls and checksums to download.')
+parser.add_argument('dest', type=str,
+                    help='Download destination folder.')
+parser.add_argument('-e', type=str, default=None,
+                    help='Extraction destination folder. Defaults to download folder if not provided')
+parser.add_argument('--skip_download', action='store_true',
+                    help='Skip downloading the files')
+parser.add_argument('--skip_checksum', action='store_true',
+                    help='Skip checksum')
+parser.add_argument('--skip_extract', action='store_true',
+                    help='Skip extracting files')
+args = parser.parse_args()
+args.e = args.e or args.dest
+
+
+df = pd.read_csv(args.csv, delimiter=',')
+
+
+if not args.skip_download:
+    for url in df.url:
+        fname = url.split('/')[-1]
+        print("Downloading %s:" % fname)
+        download_file(url=url, dest_folder=args.dest, fname=fname)
+else:
+    print("Skipping file download")
+
+
+if not args.skip_checksum:
+    for index, row in df.iterrows():
+        url = row['url']
+        md5 = row['md5']
+        fname = url.split('/')[-1]
+        fpath = os.path.join(args.dest, fname)
+        print("Verifying %s: " % fname, end='')
+        ret = md5_checksum(fpath=fpath, target_hash=md5)
+        if not ret:
+            raise ValueError(f"Checksum for {fname} failed!")
+        else:
+            print(f"Checksum correct for {fname}")
+else:
+    print("Skipping checksum")
+
+
+if not args.skip_extract:
+    for url in df.url:
+        fname = url.split('/')[-1]
+        fpath = os.path.join(args.dest, fname)
+        print("Decompressing %s:" % fpath)
+        extract(fpath=fpath, dest_folder=args.e)
+else:
+    print("Skipping file extraction")
diff --git a/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/utils/download_utils.py b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/utils/download_utils.py
new file mode 100644
index 00000000000..6dbc3cf17a2
--- /dev/null
+++ b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/utils/download_utils.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import hashlib +import requests +import os +import tarfile +import tqdm + + +def download_file(url, dest_folder, fname, overwrite=False): + fpath = os.path.join(dest_folder, fname) + if os.path.isfile(fpath): + if overwrite: + print("Overwriting existing file") + else: + print("File exists, skipping download.") + return + + tmp_fpath = fpath + '.tmp' + + r = requests.get(url, stream=True) + file_size = int(r.headers['Content-Length']) + chunk_size = 1024 * 1024 # 1MB + total_chunks = int(file_size / chunk_size) + + with open(tmp_fpath, 'wb') as fp: + content_iterator = r.iter_content(chunk_size=chunk_size) + chunks = tqdm.tqdm(content_iterator, total=total_chunks, + unit='MB', desc=fpath, leave=True) + for chunk in chunks: + fp.write(chunk) + + os.rename(tmp_fpath, fpath) + + +def md5_checksum(fpath, target_hash): + file_hash = hashlib.md5() + with open(fpath, "rb") as fp: + for chunk in iter(lambda: fp.read(1024 * 1024), b""): + file_hash.update(chunk) + return file_hash.hexdigest() == target_hash + + +def extract(fpath, dest_folder): + if fpath.endswith('.tar.gz'): + mode = 'r:gz' + elif fpath.endswith('.tar'): + mode = 'r:' + else: + raise IOError('fpath has unknown extension: %s' % fpath) + + with tarfile.open(fpath, mode) as tar: + members = tar.getmembers() + for member in tqdm.tqdm(iterable=members, total=len(members), leave=True): + tar.extract(path=dest_folder, member=member) diff --git a/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/utils/preprocessing_utils.py b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/utils/preprocessing_utils.py new file mode 100644 index 00000000000..e32dfd8bc64 --- /dev/null +++ b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch/utils/preprocessing_utils.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
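+# preprocess() converts one LibriSpeech .flac entry to .wav with sox, optionally
+# writing additional speed-perturbed copies, and returns a manifest record;
+# parallel_preprocess() maps it over the dataset with a multiprocessing pool.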
+ +import os +import multiprocessing +import functools + +import sox + + +from tqdm import tqdm + + +def preprocess(data, input_dir, dest_dir, target_sr=None, speed=None, + overwrite=True): + speed = speed or [] + speed.append(1) + speed = list(set(speed)) # Make unique + + input_fname = os.path.join(input_dir, + data['input_relpath'], + data['input_fname']) + input_sr = sox.file_info.sample_rate(input_fname) + target_sr = target_sr or input_sr + + os.makedirs(os.path.join(dest_dir, data['input_relpath']), exist_ok=True) + + output_dict = {} + output_dict['transcript'] = data['transcript'].lower().strip() + output_dict['files'] = [] + + fname = os.path.splitext(data['input_fname'])[0] + for s in speed: + output_fname = fname + \ + '{}.wav'.format('' if s == 1 else '-{}'.format(s)) + output_fpath = os.path.join(dest_dir, + data['input_relpath'], + output_fname) + + if not os.path.exists(output_fpath) or overwrite: + cbn = sox.Transformer().speed(factor=s).convert(target_sr) + cbn.build(input_fname, output_fpath) + + file_info = sox.file_info.info(output_fpath) + file_info['fname'] = os.path.join(os.path.basename(dest_dir), + data['input_relpath'], + output_fname) + file_info['speed'] = s + output_dict['files'].append(file_info) + + if s == 1: + file_info = sox.file_info.info(output_fpath) + output_dict['original_duration'] = file_info['duration'] + output_dict['original_num_samples'] = file_info['num_samples'] + + return output_dict + + +def parallel_preprocess(dataset, input_dir, dest_dir, target_sr, speed, overwrite, parallel): + with multiprocessing.Pool(parallel) as p: + func = functools.partial(preprocess, + input_dir=input_dir, dest_dir=dest_dir, + target_sr=target_sr, speed=speed, overwrite=overwrite) + dataset = list(tqdm(p.imap(func, dataset), total=len(dataset))) + return dataset diff --git a/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch_SUT.py b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch_SUT.py new file mode 100644 index 00000000000..615df2abbc5 --- /dev/null +++ b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/pytorch_SUT.py @@ -0,0 +1,104 @@ +# Copyright (c) 2020, Cerebras Systems, Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
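+# LoadGen System Under Test wrapper: builds an in-memory QSL over the wav
+# manifest, TorchScripts and freezes the audio preprocessor, loads the RNN-T
+# checkpoint (migrating joint_net.* keys to joint.net.*), and answers each
+# query with a greedy-decoded transcript.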
+ +import sys +import os +sys.path.insert(0, os.path.join(os.getcwd(), "pytorch")) + +import array +import torch +import numpy as np +import toml +import mlperf_loadgen as lg +from tqdm import tqdm + +from QSL import AudioQSL, AudioQSLInMemory +from decoders import ScriptGreedyDecoder +from helpers import add_blank_label +from preprocessing import AudioPreprocessing +from model_separable_rnnt import RNNT + + +def load_and_migrate_checkpoint(ckpt_path): + checkpoint = torch.load(ckpt_path, map_location="cpu") + migrated_state_dict = {} + for key, value in checkpoint['state_dict'].items(): + key = key.replace("joint_net", "joint.net") + migrated_state_dict[key] = value + del migrated_state_dict["audio_preprocessor.featurizer.fb"] + del migrated_state_dict["audio_preprocessor.featurizer.window"] + return migrated_state_dict + + +class PytorchSUT: + def __init__(self, config_toml, checkpoint_path, dataset_dir, + manifest_filepath, perf_count): + config = toml.load(config_toml) + + dataset_vocab = config['labels']['labels'] + rnnt_vocab = add_blank_label(dataset_vocab) + featurizer_config = config['input_eval'] + + self.sut = lg.ConstructSUT(self.issue_queries, self.flush_queries) + self.qsl = AudioQSLInMemory(dataset_dir, + manifest_filepath, + dataset_vocab, + featurizer_config["sample_rate"], + perf_count) + self.audio_preprocessor = AudioPreprocessing(**featurizer_config) + self.audio_preprocessor.eval() + self.audio_preprocessor = torch.jit.script(self.audio_preprocessor) + self.audio_preprocessor = torch.jit._recursive.wrap_cpp_module( + torch._C._freeze_module(self.audio_preprocessor._c)) + + model = RNNT( + feature_config=featurizer_config, + rnnt=config['rnnt'], + num_classes=len(rnnt_vocab) + ) + model.load_state_dict(load_and_migrate_checkpoint(checkpoint_path), + strict=True) + model.eval() + self.greedy_decoder = ScriptGreedyDecoder(len(rnnt_vocab) - 1, model) + + def issue_queries(self, query_samples): + for query_sample in query_samples: + waveform = self.qsl[query_sample.index] + assert waveform.ndim == 1 + waveform_length = np.array(waveform.shape[0], dtype=np.int64) + waveform = np.expand_dims(waveform, 0) + waveform_length = np.expand_dims(waveform_length, 0) + with torch.no_grad(): + waveform = torch.from_numpy(waveform) + waveform_length = torch.from_numpy(waveform_length) + feature, feature_length = self.audio_preprocessor.forward((waveform, waveform_length)) + assert feature.ndim == 3 + assert feature_length.ndim == 1 + feature = feature.permute(2, 0, 1) + + _, _, transcript = self.greedy_decoder.forward(feature, feature_length) + + assert len(transcript) == 1 + response_array = array.array('q', transcript[0]) + bi = response_array.buffer_info() + response = lg.QuerySampleResponse(query_sample.id, bi[0], + bi[1] * response_array.itemsize) + lg.QuerySamplesComplete([response]) + + def flush_queries(self): + pass + + def __del__(self): + lg.DestroySUT(self.sut) + print("Finished destroying SUT.") \ No newline at end of file diff --git a/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/requirements.txt b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/requirements.txt new file mode 100644 index 00000000000..d4db5294aa4 --- /dev/null +++ b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/requirements.txt @@ -0,0 +1,10 @@ +neural-compressor +sox +absl-py +toml +text-unidecode +inflect +librosa==0.8.1 +torch <= 1.13.1 +tqdm +numpy <= 1.22.4 \ No newline at end of file diff --git 
a/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/run.sh b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/run.sh
new file mode 100644
index 00000000000..7538df99bdb
--- /dev/null
+++ b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/run.sh
@@ -0,0 +1,90 @@
+#!/bin/bash
+
+set -euo pipefail
+
+work_dir=/export/b07/ws15dgalvez/mlperf-rnnt-librispeech
+local_data_dir=$work_dir/local_data
+librispeech_download_dir=$local_data_dir/LibriSpeech
+stage=3
+
+mkdir -p $work_dir $local_data_dir $librispeech_download_dir
+
+install_dir=third_party/install
+mkdir -p $install_dir
+install_dir=$(readlink -f $install_dir)
+
+set +u
+source "$($CONDA_EXE info --base)/etc/profile.d/conda.sh"
+set -u
+
+# stage -1: install dependencies
+if [[ $stage -le -1 ]]; then
+  conda env create --force -v --file environment.yml
+
+  set +u
+  source "$(conda info --base)/etc/profile.d/conda.sh"
+  conda activate mlperf-rnnt
+  set -u
+
+  # We need to convert .flac files to .wav files via sox. Not all sox installs have flac support, so we install from source.
+  wget https://ftp.osuosl.org/pub/xiph/releases/flac/flac-1.3.2.tar.xz -O third_party/flac-1.3.2.tar.xz
+  (cd third_party; tar xf flac-1.3.2.tar.xz; cd flac-1.3.2; ./configure --prefix=$install_dir && make && make install)
+
+  wget https://sourceforge.net/projects/sox/files/sox/14.4.2/sox-14.4.2.tar.gz -O third_party/sox-14.4.2.tar.gz
+  (cd third_party; tar zxf sox-14.4.2.tar.gz; cd sox-14.4.2; LDFLAGS="-L${install_dir}/lib" CFLAGS="-I${install_dir}/include" ./configure --prefix=$install_dir --with-flac && make && make install)
+
+  (cd $(git rev-parse --show-toplevel)/loadgen; python setup.py install)
+fi
+
+export PATH="$install_dir/bin/:$PATH"
+
+set +u
+conda activate mlperf-rnnt
+set -u
+
+# stage 0: download model. Check checksum to skip?
+if [[ $stage -le 0 ]]; then
+  wget https://zenodo.org/record/3662521/files/DistributedDataParallel_1576581068.9962234-epoch-100.pt?download=1 -O $work_dir/rnnt.pt
+fi
+
+# stage 1: download data. This will have a non-zero exit code if the
+# checksum is incorrect.
+if [[ $stage -le 1 ]]; then
+  python pytorch/utils/download_librispeech.py \
+    pytorch/utils/librispeech-inference.csv \
+    $librispeech_download_dir \
+    -e $local_data_dir
+fi
+
+if [[ $stage -le 2 ]]; then
+  python pytorch/utils/convert_librispeech.py \
+    --input_dir $librispeech_download_dir/dev-clean \
+    --dest_dir $local_data_dir/dev-clean-wav \
+    --output_json $local_data_dir/dev-clean-wav.json
+fi
+
+if [[ $stage -le 3 ]]; then
+  for backend in pytorch; do
+    for accuracy in "--accuracy" ""; do
+      for scenario in SingleStream Offline Server; do
+        log_dir=${work_dir}/${scenario}_${backend}
+        if [ !
-z ${accuracy} ]; then + log_dir+=_accuracy + fi + log_dir+=rerun + + python run.py --backend pytorch \ + --dataset_dir $local_data_dir \ + --manifest $local_data_dir/dev-clean-wav.json \ + --pytorch_config_toml pytorch/configs/rnnt.toml \ + --pytorch_checkpoint $work_dir/rnnt.pt \ + --scenario ${scenario} \ + --backend ${backend} \ + --log_dir ${log_dir} \ + ${accuracy} & + + done + done + done + wait +fi diff --git a/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/run_benchmark.sh b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/run_benchmark.sh new file mode 100644 index 00000000000..02c20b21e8e --- /dev/null +++ b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/run_benchmark.sh @@ -0,0 +1,84 @@ +#!/bin/bash +set -x + +function main { + + init_params "$@" + run_benchmark + +} + +# init params +function init_params { + tuned_checkpoint=saved_results + for var in "$@" + do + case $var in + --topology=*) + topology=$(echo $var |cut -f2 -d=) + ;; + --dataset_location=*) + dataset_location=$(echo $var |cut -f2 -d=) + ;; + --input_model=*) + input_model=$(echo $var |cut -f2 -d=) + ;; + --mode=*) + mode=$(echo $var |cut -f2 -d=) + ;; + --batch_size=*) + batch_size=$(echo $var |cut -f2 -d=) + ;; + --iters=*) + iters=$(echo ${var} |cut -f2 -d=) + ;; + --int8=*) + int8=$(echo ${var} |cut -f2 -d=) + ;; + --config=*) + tuned_checkpoint=$(echo $var |cut -f2 -d=) + ;; + *) + echo "Error: No such parameter: ${var}" + exit 1 + ;; + esac + done +} + +# run_benchmark +function run_benchmark { + + if [[ ${mode} == "accuracy" ]]; then + mode_cmd="--accuracy " + elif [[ ${mode} == "performance" ]]; then + mode_cmd="--performance " + else + echo "Error: No such mode: ${mode}" + exit 1 + fi + + extra_cmd="" + if [ -n "$dataset_location" ];then + extra_cmd=$extra_cmd"--dataset_dir ${dataset_location} " + fi + if [ -n "$input_model" ];then + extra_cmd=$extra_cmd"--pytorch_checkpoint ${input_model} " + fi + if [ -n "$tuned_checkpoint" ];then + extra_cmd=$extra_cmd"--tuned_checkpoint ${tuned_checkpoint} " + fi + if [[ ${int8} == "true" ]]; then + extra_cmd=$extra_cmd"--int8" + fi + + python run_tune.py \ + --backend pytorch \ + --manifest $dataset_location/dev-clean-wav.json \ + --pytorch_config_toml pytorch/configs/rnnt.toml \ + --scenario SingleStream \ + ${mode_cmd} \ + ${extra_cmd} +} + +main "$@" diff --git a/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/run_quant.sh b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/run_quant.sh new file mode 100644 index 00000000000..63f0b6d9231 --- /dev/null +++ b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/run_quant.sh @@ -0,0 +1,59 @@ +#!/bin/bash +set -x + +function main { + + init_params "$@" + run_tuning + +} + +# init params +function init_params { + tuned_checkpoint=saved_results + for var in "$@" + do + case $var in + --topology=*) + topology=$(echo $var |cut -f2 -d=) + ;; + --dataset_location=*) + dataset_location=$(echo $var |cut -f2 -d=) + ;; + --input_model=*) + input_model=$(echo $var |cut -f2 -d=) + ;; + --output_model=*) + tuned_checkpoint=$(echo $var |cut -f2 -d=) + ;; + *) + echo "Error: No such parameter: ${var}" + exit 1 + ;; + esac + done +} + +# run_tuning +function run_tuning { + extra_cmd="" + if [ -n "$dataset_location" ];then + extra_cmd=$extra_cmd"--dataset_dir ${dataset_location} " + fi + if [ -n "$input_model" ];then + extra_cmd=$extra_cmd"--pytorch_checkpoint ${input_model} " + fi + if [ -n "$tuned_checkpoint" 
];then + extra_cmd=$extra_cmd"--tuned_checkpoint ${tuned_checkpoint} " + fi + + python run_tune.py \ + --tune \ + --backend pytorch \ + --manifest $dataset_location/dev-clean-wav.json \ + --pytorch_config_toml pytorch/configs/rnnt.toml \ + --scenario Offline \ + ${extra_cmd} +} + +main "$@" \ No newline at end of file diff --git a/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/run_tune.py b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/run_tune.py new file mode 100644 index 00000000000..4400ffa179d --- /dev/null +++ b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/run_tune.py @@ -0,0 +1,151 @@ +# Copyright 2020 The MLPerf Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +import argparse +import mlperf_loadgen as lg +import subprocess + +import time +import os +from pathlib import Path +import re + +MLPERF_CONF = Path(os.path.dirname(os.path.realpath(__file__))) / "./mlperf.conf" +MLPERF_CONF = MLPERF_CONF.resolve() + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument('--tune', dest='tune', action='store_true', + help='tune best int8 model on calibration dataset') + parser.add_argument("--backend", choices=["pytorch"], default="pytorch", help="Backend") + parser.add_argument("--scenario", choices=["SingleStream", "Offline", "Server"], + default="Offline", help="Scenario") + parser.add_argument("--mlperf_conf", default=str(MLPERF_CONF), help="mlperf rules config") + parser.add_argument("--user_conf", default="user.conf", + help="user config for user LoadGen settings such as target QPS") + parser.add_argument("--pytorch_config_toml", default="pytorch/configs/rnnt.toml") + parser.add_argument("--pytorch_checkpoint", default="pytorch/work_dir/rnnt.pt") + parser.add_argument("--dataset_dir", required=True) + parser.add_argument("--manifest", required=True) + parser.add_argument("--perf_count", type=int, default=None) + parser.add_argument("--log_dir", default='./saved_log') + parser.add_argument('--performance', dest='performance', action='store_true', + help='run benchmark') + parser.add_argument("--accuracy", dest='accuracy', action='store_true', + help='For accuracy measurement only.') + parser.add_argument('--int8', dest='int8', action='store_true', help='load int8 model') + parser.add_argument("--tuned_checkpoint", default='./saved_results', type=str, metavar='PATH', + help='path to checkpoint tuned by Neural Compressor (default: ./)') + args = parser.parse_args() + return args + + +scenario_map = { + "SingleStream": lg.TestScenario.SingleStream, + "Offline": lg.TestScenario.Offline, + "Server": lg.TestScenario.Server, +} + + +def main(): + args = get_args() + + if args.backend == "pytorch": + from pytorch_SUT import PytorchSUT + sut = PytorchSUT(args.pytorch_config_toml, args.pytorch_checkpoint, + args.dataset_dir, args.manifest, args.perf_count) + model = sut.greedy_decoder._model + else: + 
raise ValueError("Unknown backend: {:}".format(args.backend)) + + settings = lg.TestSettings() + settings.scenario = scenario_map[args.scenario] + settings.FromConfig(args.mlperf_conf, "rnnt", args.scenario) + settings.FromConfig(args.user_conf, "rnnt", args.scenario) + + if args.performance: + settings.mode = lg.TestMode.PerformanceOnly + else: + settings.mode = lg.TestMode.AccuracyOnly + + log_path = args.log_dir + os.makedirs(log_path, exist_ok=True) + log_output_settings = lg.LogOutputSettings() + log_output_settings.outdir = log_path + log_output_settings.copy_summary_to_stdout = True + log_settings = lg.LogSettings() + log_settings.log_output = log_output_settings + + pattern = ['accuracy=\d+.\d+', 'samples_per_query : \d+', 'Mean latency.*'] + + def eval_func(model): + print("Running Loadgen test...") + sut.greedy_decoder._model = model + lg.StartTestWithLogSettings(sut.sut, sut.qsl.qsl, settings, log_settings) + cmd = f"python3 accuracy_eval.py --log_dir {log_path} \ + --dataset_dir {args.dataset_dir} --manifest {args.manifest}" + out = subprocess.check_output(cmd, shell=True) + out = out.decode() + regex_accu = re.compile(pattern[0]) + accu = float(regex_accu.findall(out)[0].split('=')[1]) + print('Accuracy: %.3f ' % (accu)) + return accu + + def benchmark(model): + print("Running Loadgen test...") + sut.greedy_decoder._model = model + lg.StartTestWithLogSettings(sut.sut, sut.qsl.qsl, settings, log_settings) + file_path = os.path.join(log_path, 'mlperf_log_summary.txt') + f = open(file_path, 'r', encoding='UTF-8') + file_content = f.read() + f.close() + regex_batch = re.compile(pattern[1]) + regex_late = re.compile(pattern[2]) + samples_per_query = int(regex_batch.findall(file_content)[0].split(': ')[1]) + latency_per_sample = int(regex_late.findall(file_content)[0].split(': ')[1]) + print('Batch size = %d' % samples_per_query) + print('Latency: %.3f ms' % (latency_per_sample / 10**6)) + print('Throughput: %.3f samples/sec' % (10**9/latency_per_sample)) + + if args.tune: + from neural_compressor import PostTrainingQuantConfig + from neural_compressor import quantization + conf = PostTrainingQuantConfig(approach="dynamic") + q_model = quantization.fit(model, + conf, + eval_func=eval_func) + q_model.save(args.tuned_checkpoint) + return + + elif args.int8: + from neural_compressor.utils.pytorch import load + int8_model = load(os.path.abspath(os.path.expanduser(args.tuned_checkpoint)), model) + if args.accuracy: + eval_func(int8_model) + elif args.performance: + benchmark(int8_model) + else: + if args.accuracy: + eval_func(model) + elif args.performance: + benchmark(model) + + + print("Done!", flush=True) + + +if __name__ == "__main__": + main() diff --git a/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/user.conf b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/user.conf new file mode 100644 index 00000000000..38057aaeade --- /dev/null +++ b/examples/pytorch/speech_recognition/rnnt/quantization/ptq_dynamic/fx/user.conf @@ -0,0 +1,6 @@ +# Please set these fields depending on the performance of your system to +# override default LoadGen settings. +*.SingleStream.target_latency = 10 +*.MultiStream.target_latency = 80 +*.Server.target_qps = 1.0 +*.Offline.target_qps = 1.0 \ No newline at end of file
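For reference, a minimal invocation of the new entry points might look like the sketch below. The paths are placeholders: they assume the rnnt.pt checkpoint and the dev-clean-wav.json manifest were prepared as in run.sh, and that the tuned model is written to ./saved_results.

    # dynamic post-training quantization with Neural Compressor
    bash run_quant.sh --input_model=/path/to/rnnt.pt \
        --dataset_location=/path/to/local_data --output_model=saved_results

    # accuracy, then performance, of the resulting int8 model
    bash run_benchmark.sh --input_model=/path/to/rnnt.pt \
        --dataset_location=/path/to/local_data --config=saved_results \
        --int8=true --mode=accuracy
    bash run_benchmark.sh --input_model=/path/to/rnnt.pt \
        --dataset_location=/path/to/local_data --config=saved_results \
        --int8=true --mode=performance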