From 68c5d60d8816ae202f19ec0dba063d1b322bef9a Mon Sep 17 00:00:00 2001 From: andreii Date: Thu, 25 Mar 2021 16:29:45 -0700 Subject: [PATCH 1/5] Implementation of the use of DALI. --- scripts/classification/imagenet/dali.py | 197 +++++++++++++++ scripts/classification/imagenet/test.sh | 35 +++ .../classification/imagenet/train_imagenet.py | 239 +++++++++--------- 3 files changed, 358 insertions(+), 113 deletions(-) create mode 100644 scripts/classification/imagenet/dali.py create mode 100755 scripts/classification/imagenet/test.sh diff --git a/scripts/classification/imagenet/dali.py b/scripts/classification/imagenet/dali.py new file mode 100644 index 0000000000..cd42c41550 --- /dev/null +++ b/scripts/classification/imagenet/dali.py @@ -0,0 +1,197 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings +from nvidia.dali.pipeline import Pipeline +import nvidia.dali.ops as ops +import nvidia.dali.types as types +from nvidia.dali.plugin.mxnet import DALIClassificationIterator, LastBatchPolicy +import horovod.mxnet as hvd + + +def add_dali_args(parser): + group = parser.add_argument_group('DALI data backend', 'entire group applies only to dali data backend') + group.add_argument('--dali-separ-val', action='store_true', + help='each process will perform independent validation on whole val-set') + group.add_argument('--dali-threads', type=int, default=3, help="number of threads" +\ + "per GPU for DALI") + group.add_argument('--dali-validation-threads', type=int, default=10, help="number of threads" +\ + "per GPU for DALI for validation") + group.add_argument('--dali-prefetch-queue', type=int, default=2, help="DALI prefetch queue depth") + group.add_argument('--dali-nvjpeg-memory-padding', type=int, default=64, help="Memory padding value for nvJPEG (in MB)") + group.add_argument('--dali-fuse-decoder', type=int, default=1, help="0 or 1 whether to fuse decoder or not") + group.add_argument('--flag', type=int, default=1, help="Flag") + return parser + +def add_data_args(parser): + def int_list(x): + return list(map(int, x.split(','))) + + data = parser.add_argument_group('Data') + data.add_argument('--data-pred', type=str, help='the image on which run inference (only for pred mode)') + data.add_argument('--image-shape', type=int_list, default=[3, 224, 224], + help='the image shape feed into the network') + + data.add_argument('--input-layout', type=str, default='NCHW', choices=('NCHW', 'NHWC'), + help='the layout of the input data') + data.add_argument('--conv-layout', type=str, default='NCHW', choices=('NCHW', 'NHWC'), + help='the layout of the data assumed by the conv operation') + data.add_argument('--batchnorm-layout', type=str, default='NCHW', choices=('NCHW', 'NHWC'), + help='the layout of the data assumed by the batchnorm operation') + data.add_argument('--pooling-layout', type=str, default='NCHW', choices=('NCHW', 'NHWC'), + help='the layout of the data assumed by the pooling operation') + + 
data.add_argument('--num-examples', type=int, default=1281167, + help="the number of training examples (doesn't work with mxnet data backend)") + data.add_argument('--data-val-resize', type=int, default=256, + help='base length of shorter edge for validation dataset') + data.add_argument('--kv-store', type=str, default='device', choices=('device', 'horovod'), + help='key-value store type') + return data + + +def get_device_names(dali_cpu): + return ("cpu", "cpu") if dali_cpu else ("gpu", "mixed") + + +class HybridTrainPipe(Pipeline): + def __init__(self, args, batch_size, num_threads, device_id, rec_path, idx_path, + shard_id, num_shards, crop_shape, nvjpeg_padding, prefetch_queue=3, + output_layout=types.NCHW, pad_output=True, dtype='float16', dali_cpu=False): + super(HybridTrainPipe, self).__init__(batch_size, num_threads, device_id, seed=12 + device_id, prefetch_queue_depth = prefetch_queue) + self.input = ops.MXNetReader(path=[rec_path], index_path=[idx_path], + random_shuffle=True, shard_id=shard_id, num_shards=num_shards) + + dali_device, decoder_device = get_device_names(dali_cpu) + if args.dali_fuse_decoder: + self.decode = ops.ImageDecoderRandomCrop(device=decoder_device, output_type=types.RGB, + device_memory_padding=nvjpeg_padding, host_memory_padding=nvjpeg_padding) + else: + self.decode = ops.ImageDecoder(device=decoder_device, output_type=types.RGB, + device_memory_padding=nvjpeg_padding, host_memory_padding=nvjpeg_padding) + + if args.dali_fuse_decoder: + self.resize = ops.Resize(device=dali_device, resize_x=crop_shape[1], resize_y=crop_shape[0]) + else: + self.resize = ops.RandomResizedCrop(device=dali_device, size=crop_shape) + + self.cmnp = ops.CropMirrorNormalize(device="gpu", + dtype=types.FLOAT16 if dtype == 'float16' else types.FLOAT, + output_layout=output_layout, crop=crop_shape, pad_output=pad_output, + mean=args.rgb_mean, std=args.rgb_std) + self.coin = ops.random.CoinFlip(probability=0.5) + + def define_graph(self): + rng = self.coin() + self.jpegs, self.labels = self.input(name="Reader") + + images = self.decode(self.jpegs) + images = self.resize(images) + output = self.cmnp(images.gpu(), mirror=rng) + return [output, self.labels] + + +class HybridValPipe(Pipeline): + def __init__(self, args, batch_size, num_threads, device_id, rec_path, idx_path, + shard_id, num_shards, crop_shape, nvjpeg_padding, prefetch_queue=3, resize_shp=None, + output_layout=types.NCHW, pad_output=True, dtype='float16', dali_cpu=False): + super(HybridValPipe, self).__init__(batch_size, num_threads, device_id, seed=12 + device_id, prefetch_queue_depth=prefetch_queue) + self.input = ops.MXNetReader(path=[rec_path], index_path=[idx_path], + random_shuffle=False, shard_id=shard_id, num_shards=num_shards) + + dali_device, decoder_device = get_device_names(dali_cpu) + self.decode = ops.ImageDecoder(device=decoder_device, output_type=types.RGB, + device_memory_padding=nvjpeg_padding, + host_memory_padding=nvjpeg_padding) + self.resize = ops.Resize(device=dali_device, resize_shorter=resize_shp) if resize_shp else None + self.cmnp = ops.CropMirrorNormalize(device="gpu", + dtype=types.FLOAT16 if dtype == 'float16' else types.FLOAT, + output_layout=output_layout, crop=crop_shape, pad_output=pad_output, + mean=args.rgb_mean, std=args.rgb_std) + + def define_graph(self): + self.jpegs, self.labels = self.input(name="Reader") + images = self.decode(self.jpegs) + if self.resize: + images = self.resize(images) + output = self.cmnp(images.gpu()) + return [output, self.labels] + + +def get_rec_iter(args, 
kv=None, batch_fn=None, dali_cpu=False): + devices = [0] if dali_cpu else args.gpus + num_devices = len(devices) + pad_output = (args.image_shape[0] == 4) + + # the input_layout w.r.t. the model is the output_layout of the image pipeline + output_layout = types.NHWC if args.input_layout == 'NHWC' else types.NCHW + + if 'horovod' in args.kv_store: + rank = hvd.rank() + nWrk = hvd.size() + else: + rank = kv.rank if kv else 0 + nWrk = kv.num_workers if kv else 1 + + batch_size = args.batch_size // nWrk * num_devices + + trainpipes = [HybridTrainPipe(args = args, + batch_size = batch_size, + num_threads = args.dali_threads, + device_id = dev_id, + rec_path = args.rec_train, + idx_path = args.rec_train_idx, + shard_id = devices.index(dev_id) + num_devices*rank, + num_shards = num_devices*nWrk, + crop_shape = args.image_shape[1:], + output_layout = output_layout, + dtype = args.dtype, + pad_output = pad_output, + dali_cpu = dali_cpu, + nvjpeg_padding = args.dali_nvjpeg_memory_padding * 1024 * 1024, + prefetch_queue = args.dali_prefetch_queue) for dev_id in devices] + trainpipes[0].build() + num_examples = trainpipes[0].epoch_size("Reader") + if args.num_examples < num_examples: + warnings.warn("{} training examples will be used, although full training set contains {} examples".format(args.num_examples, num_examples)) + + train_examples = args.num_examples // nWrk + dali_train_iter = DALIClassificationIterator(trainpipes, train_examples) + if not args.rec_val: + return dali_train_iter, None, batch_fn + + valpipes = [HybridValPipe(args = args, + batch_size = batch_size, + num_threads = args.dali_validation_threads, + device_id = dev_id, + rec_path = args.rec_val, + idx_path = args.rec_val_idx, + shard_id = 0 if args.dali_separ_val else devices.index(dev_id) + num_devices*rank, + num_shards = 1 if args.dali_separ_val else num_devices*nWrk, + crop_shape = args.image_shape[1:], + resize_shp = args.data_val_resize, + output_layout = output_layout, + dtype = args.dtype, + pad_output = pad_output, + dali_cpu = dali_cpu, + nvjpeg_padding = args.dali_nvjpeg_memory_padding * 1024 * 1024, + prefetch_queue = args.dali_prefetch_queue) for dev_id in devices] + valpipes[0].build() + worker_val_examples = valpipes[0].epoch_size("Reader") + if not args.dali_separ_val: + adj = 1 if rank < worker_val_examples % nWrk else 0 + worker_val_examples = adj + worker_val_examples // nWrk + + dali_val_iter = DALIClassificationIterator(valpipes, worker_val_examples) + return dali_train_iter, dali_val_iter, batch_fn diff --git a/scripts/classification/imagenet/test.sh b/scripts/classification/imagenet/test.sh new file mode 100755 index 0000000000..50e69f0221 --- /dev/null +++ b/scripts/classification/imagenet/test.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +if [ -z "$MODEL" ]; then + export MODEL=resnet18_v1 +fi + +if [ -z "$NUM_TRAINING_SAMPLES" ]; then + export NUM_TRAINING_SAMPLES=1281167 +fi + +if [ -z "$NUM_EPOCHS" ]; then + export NUM_EPOCHS=3 +fi + +if [ -z "$NUM_GPUS" ] || [ $NUM_GPUS '-lt' 0 ]; then + export NUM_GPUS=0 +fi + +if [ -z "$DATA_BACKEND" ]; then + export DATA_BACKEND='mxnet' # Options are: dali-gpu, dali-cpu, mxnet +fi + +if [ -z "$TRAIN_DATA_DIR" ]; then + export TRAIN_DATA_DIR=~/.mxnet/datasets/imagenet + export TRAIN_DATA_DIR=/data/imagenet/train-480-val-256-recordio +fi + +pip install --extra-index-url https://developer.download.nvidia.com/compute/redist nvidia-dali-cuda100 + +python train_imagenet.py --model $MODEL --data-backend $DATA_BACKEND --num-gpus $NUM_GPUS \ + --num-epochs $NUM_EPOCHS 
--num-training-samples $NUM_TRAINING_SAMPLES --use-rec \ + --rec-train $TRAIN_DATA_DIR/train.rec --rec-train-idx $TRAIN_DATA_DIR/train.idx \ + --rec-val $TRAIN_DATA_DIR/val.rec --rec-val-idx $TRAIN_DATA_DIR/val.idx --data-dir $TRAIN_DATA_DIR \ + + diff --git a/scripts/classification/imagenet/train_imagenet.py b/scripts/classification/imagenet/train_imagenet.py index 7bd70f0023..99bc948e0f 100644 --- a/scripts/classification/imagenet/train_imagenet.py +++ b/scripts/classification/imagenet/train_imagenet.py @@ -2,7 +2,6 @@ import numpy as np import mxnet as mx -import gluoncv as gcv from mxnet import gluon, nd from mxnet import autograd as ag from mxnet.gluon.data.vision import transforms @@ -13,18 +12,26 @@ from gluoncv.model_zoo import get_model from gluoncv.utils import makedirs, LRSequential, LRScheduler +import dali + # CLI def parse_args(): + def float_list(x): + return list(map(float, x.split(','))) + + data_dir = '~/.mxnet/datasets/imagenet/' parser = argparse.ArgumentParser(description='Train a model for image classification.') - parser.add_argument('--data-dir', type=str, default='~/.mxnet/datasets/imagenet', + parser.add_argument('--data-backend', choices=('dali-gpu', 'dali-cpu', 'mxnet'), default='mxnet', + help='set data loading & augmentation backend') + parser.add_argument('--data-dir', type=str, default=data_dir, help='training and validation pictures to use.') - parser.add_argument('--rec-train', type=str, default='~/.mxnet/datasets/imagenet/rec/train.rec', + parser.add_argument('--rec-train', type=str, default=data_dir+'rec/train.rec', help='the training data') - parser.add_argument('--rec-train-idx', type=str, default='~/.mxnet/datasets/imagenet/rec/train.idx', + parser.add_argument('--rec-train-idx', type=str, default=data_dir+'rec/train.idx', help='the index of training data') - parser.add_argument('--rec-val', type=str, default='~/.mxnet/datasets/imagenet/rec/val.rec', + parser.add_argument('--rec-val', type=str, default=data_dir+'rec/val.rec', help='the validation data') - parser.add_argument('--rec-val-idx', type=str, default='~/.mxnet/datasets/imagenet/rec/val.idx', + parser.add_argument('--rec-val-idx', type=str, default=data_dir+'rec/val.idx', help='the index of validation data') parser.add_argument('--use-rec', action='store_true', help='use image record iter for data input. 
default is false.') @@ -104,11 +111,23 @@ def parse_args(): help='name of training log file') parser.add_argument('--use-gn', action='store_true', help='whether to use group norm.') - opt = parser.parse_args() - return opt + parser.add_argument('--rgb-mean', type=float_list, default=[123.68, 116.779, 103.939], + help='a tuple of size 3 for the mean rgb') + parser.add_argument('--rgb-std', type=float_list, default=[58.393, 57.12, 57.375], + help='a tuple of size 3 for the std rgb') + parser.add_argument('--num-training-samples', type=int, default=1281167, + help='Number of training samples') + parser = dali.add_dali_args(parser) + dali.add_data_args(parser) + return parser.parse_args() def main(): + def batch_func(batch, ctx): + data = gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0) + label = gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0) + return data, label + opt = parse_args() filehandler = logging.FileHandler(opt.logging_file) @@ -121,14 +140,11 @@ def main(): logger.info(opt) - batch_size = opt.batch_size classes = 1000 - num_training_samples = 1281167 - num_gpus = opt.num_gpus - batch_size *= max(1, num_gpus) + batch_size = opt.batch_size * max(1, num_gpus) context = [mx.gpu(i) for i in range(num_gpus)] if num_gpus > 0 else [mx.cpu()] - num_workers = opt.num_workers + opt.gpus = [i for i in range(num_gpus)] lr_decay = opt.lr_decay lr_decay_period = opt.lr_decay_period @@ -137,7 +153,7 @@ def main(): else: lr_decay_epoch = [int(i) for i in opt.lr_decay_epoch.split(',')] lr_decay_epoch = [e - opt.warmup_epochs for e in lr_decay_epoch] - num_batches = num_training_samples // batch_size + num_batches = opt.num_training_samples // batch_size lr_scheduler = LRSequential([ LRScheduler('linear', base_lr=0, target_lr=opt.lr, @@ -173,47 +189,39 @@ def main(): net.load_parameters(opt.resume_params, ctx = context) # teacher model for distillation training - if opt.teacher is not None and opt.hard_weight < 1.0: - teacher_name = opt.teacher - teacher = get_model(teacher_name, pretrained=True, classes=classes, ctx=context) + distillation = opt.teacher is not None and opt.hard_weight < 1.0 + if distillation: + teacher = get_model(opt.teacher, pretrained=True, classes=classes, ctx=context) teacher.cast(opt.dtype) - distillation = True - else: - distillation = False # Two functions for reading data from record file or raw images - def get_data_rec(rec_train, rec_train_idx, rec_val, rec_val_idx, batch_size, num_workers): - rec_train = os.path.expanduser(rec_train) - rec_train_idx = os.path.expanduser(rec_train_idx) - rec_val = os.path.expanduser(rec_val) - rec_val_idx = os.path.expanduser(rec_val_idx) + def get_data_rec(args): + rec_train = os.path.expanduser(args.rec_train) + rec_train_idx = os.path.expanduser(args.rec_train_idx) + rec_val = os.path.expanduser(args.rec_val) + rec_val_idx = os.path.expanduser(args.rec_val_idx) + num_gpus = args.num_gpus + batch_size = args.batch_size * max(1, num_gpus) + jitter_param = 0.4 lighting_param = 0.1 input_size = opt.input_size crop_ratio = opt.crop_ratio if opt.crop_ratio > 0 else 0.875 resize = int(math.ceil(input_size / crop_ratio)) - mean_rgb = [123.68, 116.779, 103.939] - std_rgb = [58.393, 57.12, 57.375] - - def batch_fn(batch, ctx): - data = gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0) - label = gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0) - return data, label train_data = mx.io.ImageRecordIter( path_imgrec = rec_train, path_imgidx = rec_train_idx, - 
preprocess_threads = num_workers, + preprocess_threads = args.num_workers, shuffle = True, batch_size = batch_size, - data_shape = (3, input_size, input_size), - mean_r = mean_rgb[0], - mean_g = mean_rgb[1], - mean_b = mean_rgb[2], - std_r = std_rgb[0], - std_g = std_rgb[1], - std_b = std_rgb[2], + mean_r = args.rgb_mean[0], + mean_g = args.rgb_mean[1], + mean_b = args.rgb_mean[2], + std_r = args.rgb_std[0], + std_g = args.rgb_std[1], + std_b = args.rgb_std[2], rand_mirror = True, random_resized_crop = True, max_aspect_ratio = 4. / 3., @@ -228,22 +236,25 @@ def batch_fn(batch, ctx): val_data = mx.io.ImageRecordIter( path_imgrec = rec_val, path_imgidx = rec_val_idx, - preprocess_threads = num_workers, + preprocess_threads = args.num_workers, shuffle = False, batch_size = batch_size, resize = resize, data_shape = (3, input_size, input_size), - mean_r = mean_rgb[0], - mean_g = mean_rgb[1], - mean_b = mean_rgb[2], - std_r = std_rgb[0], - std_g = std_rgb[1], - std_b = std_rgb[2], + mean_r = args.rgb_mean[0], + mean_g = args.rgb_mean[1], + mean_b = args.rgb_mean[2], + std_r = args.rgb_std[0], + std_g = args.rgb_std[1], + std_b = args.rgb_std[2], ) - return train_data, val_data, batch_fn + return train_data, val_data, batch_func - def get_data_loader(data_dir, batch_size, num_workers): + def get_data_rec_transfomed(args): + data_dir = args.data_dir + num_workers = args.num_workers + batch_size = args.batch_size * max(1, args.num_gpus) normalize = transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) jitter_param = 0.4 lighting_param = 0.1 @@ -281,17 +292,18 @@ def batch_fn(batch, ctx): return train_data, val_data, batch_fn - if opt.use_rec: - train_data, val_data, batch_fn = get_data_rec(opt.rec_train, opt.rec_train_idx, - opt.rec_val, opt.rec_val_idx, - batch_size, num_workers) - else: - train_data, val_data, batch_fn = get_data_loader(opt.data_dir, batch_size, num_workers) + def get_data_loader(args): + if args.data_backend == 'dali-gpu': + return (lambda *args, **kwargs: dali.get_rec_iter(*args, **kwargs, batch_fn=batch_func, dali_cpu=False)) + if args.data_backend == 'dali-cpu': + return (lambda *args, **kwargs: dali.get_rec_iter(*args, **kwargs, batch_fn=batch_func, dali_cpu=True)) + if args.data_backend == 'mxnet': + return get_data_rec if args.use_rec else get_data_rec_transfomed + raise ValueError('Wrong data backend') - if opt.mixup: - train_metric = mx.metric.RMSE() - else: - train_metric = mx.metric.Accuracy() + + train_data, val_data, batch_fn = get_data_loader(opt)(opt) + train_metric = mx.metric.RMSE() if opt.mixup else mx.metric.Accuracy() acc_top1 = mx.metric.Accuracy() acc_top5 = mx.metric.TopKAccuracy(5) @@ -322,16 +334,17 @@ def smooth(label, classes, eta=0.1): smoothed.append(res) return smoothed - def test(ctx, val_data): + def test(ctx, val_data, val_batch): if opt.use_rec: val_data.reset() acc_top1.reset() acc_top5.reset() for i, batch in enumerate(val_data): - data, label = batch_fn(batch, ctx) - outputs = [net(X.astype(opt.dtype, copy=False)) for X in data] - acc_top1.update(label, outputs) - acc_top5.update(label, outputs) + for j in range(val_batch): + data, label = batch_fn(batch[j], ctx) if type(batch) == list else batch_fn(batch, ctx) + outputs = [net(X.astype(opt.dtype, copy=False)) for X in data] + acc_top1.update(label, outputs) + acc_top5.update(label, outputs) _, top1 = acc_top1.get() _, top5 = acc_top5.get() @@ -351,10 +364,7 @@ def train(ctx): if opt.resume_states != '': trainer.load_states(opt.resume_states) - if opt.label_smoothing or opt.mixup: 
- sparse_label_loss = False - else: - sparse_label_loss = True + sparse_label_loss = not (opt.label_smoothing or opt.mixup) if distillation: L = gcv.loss.DistillationSoftmaxCrossEntropyLoss(temperature=opt.temperature, hard_weight=opt.hard_weight, @@ -363,7 +373,9 @@ def train(ctx): L = gluon.loss.SoftmaxCrossEntropyLoss(sparse_label=sparse_label_loss) best_val_score = 1 - + eta = 0.1 if opt.label_smoothing else 0.0 + start_time = time.time() + val_batch = len(ctx) if opt.data_backend != 'mxnet' else 1 for epoch in range(opt.resume_epoch, opt.num_epochs): tic = time.time() if opt.use_rec: @@ -372,64 +384,64 @@ def train(ctx): btic = time.time() for i, batch in enumerate(train_data): - data, label = batch_fn(batch, ctx) - - if opt.mixup: - lam = np.random.beta(opt.mixup_alpha, opt.mixup_alpha) - if epoch >= opt.num_epochs - opt.mixup_off_epoch: - lam = 1 - data = [lam*X + (1-lam)*X[::-1] for X in data] + for j in range(val_batch): + data, label = batch_fn(batch[j], ctx) if type(batch) == list else batch_fn(batch, ctx) + if opt.mixup: + lam = np.random.beta(opt.mixup_alpha, opt.mixup_alpha) + if epoch >= opt.num_epochs - opt.mixup_off_epoch: + lam = 1 + data = [lam*X + (1-lam)*X[::-1] for X in data] + label = mixup_transform(label, classes, lam, eta) + + elif opt.label_smoothing: + hard_label = label + label = smooth(label, classes) - if opt.label_smoothing: - eta = 0.1 - else: - eta = 0.0 - label = mixup_transform(label, classes, lam, eta) - - elif opt.label_smoothing: - hard_label = label - label = smooth(label, classes) - - if distillation: - teacher_prob = [nd.softmax(teacher(X.astype(opt.dtype, copy=False)) / opt.temperature) \ - for X in data] - - with ag.record(): - outputs = [net(X.astype(opt.dtype, copy=False)) for X in data] if distillation: - loss = [L(yhat.astype('float32', copy=False), - y.astype('float32', copy=False), - p.astype('float32', copy=False)) for yhat, y, p in zip(outputs, label, teacher_prob)] - else: - loss = [L(yhat, y.astype(opt.dtype, copy=False)) for yhat, y in zip(outputs, label)] - for l in loss: - l.backward() - trainer.step(batch_size) - - if opt.mixup: - output_softmax = [nd.SoftmaxActivation(out.astype('float32', copy=False)) \ - for out in outputs] - train_metric.update(label, output_softmax) - else: - if opt.label_smoothing: - train_metric.update(hard_label, outputs) + teacher_prob = [nd.softmax(teacher(X.astype(opt.dtype, copy=False)) / opt.temperature) \ + for X in data] + + with ag.record(): + outputs = [net(X.astype(opt.dtype, copy=False)) for X in data] + if distillation: + loss = [L(yhat.astype('float32', copy=False), + y.astype('float32', copy=False), + p.astype('float32', copy=False)) for yhat, y, p in zip(outputs, label, teacher_prob)] + else: + loss = [L(yhat, y.astype(opt.dtype, copy=False)) for yhat, y in zip(outputs, label)] + for l in loss: + l.backward() + trainer.step(batch_size) + + if opt.mixup: + output_softmax = [nd.SoftmaxActivation(out.astype('float32', copy=False)) for out in outputs] + train_metric.update(label, output_softmax) else: - train_metric.update(label, outputs) + if opt.label_smoothing: + train_metric.update(hard_label, outputs) + else: + train_metric.update(label, outputs) if opt.log_interval and not (i+1)%opt.log_interval: train_metric_name, train_metric_score = train_metric.get() logger.info('Epoch[%d] Batch [%d]\tSpeed: %f samples/sec\t%s=%f\tlr=%f'%( - epoch, i, batch_size*opt.log_interval/(time.time()-btic), + epoch, i+1, batch_size*opt.log_interval*val_batch/(time.time()-btic), train_metric_name, 
train_metric_score, trainer.learning_rate)) btic = time.time() train_metric_name, train_metric_score = train_metric.get() - throughput = int(batch_size * i /(time.time() - tic)) + if opt.log_interval and i % opt.log_interval: + # We did NOT report the speed on the last iteration of the loop. Let's do it now + logger.info('Epoch[%d] Batch [%d]\tSpeed: %f samples/sec\t%s=%f\tlr=%f'%( + epoch, i, batch_size*(i%opt.log_interval)*val_batch/(time.time()-btic), + train_metric_name, train_metric_score, trainer.learning_rate)) - err_top1_val, err_top5_val = test(ctx, val_data) + epoch_time = time.time() - tic + throughput = int(batch_size * i * val_batch / epoch_time) + err_top1_val, err_top5_val = test(ctx, val_data, val_batch) logger.info('[Epoch %d] training: %s=%f'%(epoch, train_metric_name, train_metric_score)) - logger.info('[Epoch %d] speed: %d samples/sec\ttime cost: %f'%(epoch, throughput, time.time()-tic)) + logger.info('[Epoch %d] speed: %d samples/sec\ttime cost: %f'%(epoch, throughput, epoch_time)) logger.info('[Epoch %d] validation: err-top1=%f err-top5=%f'%(epoch, err_top1_val, err_top5_val)) if err_top1_val < best_val_score: @@ -445,6 +457,7 @@ def train(ctx): net.save_parameters('%s/imagenet-%s-%d.params'%(save_dir, model_name, opt.num_epochs-1)) trainer.save_states('%s/imagenet-%s-%d.states'%(save_dir, model_name, opt.num_epochs-1)) + logger.info('Training time for %d epochs: %f sec.'%(opt.num_epochs, time.time() - start_time)) if opt.mode == 'hybrid': net.hybridize(static_alloc=True, static_shape=True) From 20232528ce30c272074506db61bc3307359b9941 Mon Sep 17 00:00:00 2001 From: andreii Date: Thu, 25 Mar 2021 16:31:44 -0700 Subject: [PATCH 2/5] Default value for TRAIN_DATA_DIR changed --- scripts/classification/imagenet/test.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/classification/imagenet/test.sh b/scripts/classification/imagenet/test.sh index 50e69f0221..0e56162981 100755 --- a/scripts/classification/imagenet/test.sh +++ b/scripts/classification/imagenet/test.sh @@ -22,7 +22,6 @@ fi if [ -z "$TRAIN_DATA_DIR" ]; then export TRAIN_DATA_DIR=~/.mxnet/datasets/imagenet - export TRAIN_DATA_DIR=/data/imagenet/train-480-val-256-recordio fi pip install --extra-index-url https://developer.download.nvidia.com/compute/redist nvidia-dali-cuda100 From dec5bc8a35c9ddd6da8475cfb93d06b11ad9004f Mon Sep 17 00:00:00 2001 From: andreii Date: Mon, 29 Mar 2021 14:13:42 -0700 Subject: [PATCH 3/5] Installation of DALI moved into train_imagenet.py --- scripts/classification/imagenet/test.sh | 4 ++- .../classification/imagenet/train_imagenet.py | 28 +++++++++++++++++-- 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/scripts/classification/imagenet/test.sh b/scripts/classification/imagenet/test.sh index 0e56162981..fbecaf77f4 100755 --- a/scripts/classification/imagenet/test.sh +++ b/scripts/classification/imagenet/test.sh @@ -24,7 +24,9 @@ if [ -z "$TRAIN_DATA_DIR" ]; then export TRAIN_DATA_DIR=~/.mxnet/datasets/imagenet fi -pip install --extra-index-url https://developer.download.nvidia.com/compute/redist nvidia-dali-cuda100 +if [ -z "$DALI_VERSION" ]; then + export DALI_VERSION=nvidia-dali-cuda100 +fi python train_imagenet.py --model $MODEL --data-backend $DATA_BACKEND --num-gpus $NUM_GPUS \ --num-epochs $NUM_EPOCHS --num-training-samples $NUM_TRAINING_SAMPLES --use-rec \ diff --git a/scripts/classification/imagenet/train_imagenet.py b/scripts/classification/imagenet/train_imagenet.py index 99bc948e0f..08d18e2d50 100644 --- 
a/scripts/classification/imagenet/train_imagenet.py +++ b/scripts/classification/imagenet/train_imagenet.py @@ -14,6 +14,9 @@ import dali +dali_ver = 'DALI_VERSION' +dali_version = os.environ.get(dali_ver) if dali_ver in os.environ else 'nvidia-dali-cuda100' + # CLI def parse_args(): def float_list(x): @@ -138,6 +141,12 @@ def batch_func(batch, ctx): logger.addHandler(filehandler) logger.addHandler(streamhandler) + if opt.data_backend == 'dali-gpu' and opt.num_gpus == 0: + stream = os.popen('nvidia-smi -L | wc -l') + opt.num_gpus = int(stream.read()) + logger.info("When '--data-backend' is equal to 'dali-gpu', then `--num-gpus` should NOT be 0\n" \ + "For now '--num-gpus' will be set to the number of GPUs installed: %d" % opt.num_gpus) + logger.info(opt) classes = 1000 @@ -293,12 +302,27 @@ def batch_fn(batch, ctx): return train_data, val_data, batch_fn def get_data_loader(args): + if args.data_backend == 'mxnet': + return get_data_rec if args.use_rec else get_data_rec_transfomed + + # Check if DALI is installed: + if args.data_backend[0:5] == 'dali-': + stream = os.popen("pip list | grep dali") + output = stream.read() + if output == '': + # DALI is not installed + logger.info('DALI is not installed\nTrying to install DALI version \'%s\'' % dali_version) + ret = os.system('pip install --extra-index-url https://developer.download.nvidia.com/compute/redist ' + dali_version) + if ret != 0: + logger.info('Cannot install DALI version \'%s\'.\nPerhaps, the latest DALI version should be used.\n' \ + 'Please, see documentation on ' \ + 'https://docs.nvidia.com/deeplearning/dali/user-guide/docs/installation.html\n' \ + 'and set the environment variable %s to the appropriate version ID' % (dali_version, dali_ver)) + raise RuntimeError('DALI is not installed') if args.data_backend == 'dali-gpu': return (lambda *args, **kwargs: dali.get_rec_iter(*args, **kwargs, batch_fn=batch_func, dali_cpu=False)) if args.data_backend == 'dali-cpu': return (lambda *args, **kwargs: dali.get_rec_iter(*args, **kwargs, batch_fn=batch_func, dali_cpu=True)) - if args.data_backend == 'mxnet': - return get_data_rec if args.use_rec else get_data_rec_transfomed raise ValueError('Wrong data backend') From a0c1ee25b0da8ae82474da4ce4d2acd11b55ec81 Mon Sep 17 00:00:00 2001 From: andreii Date: Wed, 7 Apr 2021 15:17:31 -0700 Subject: [PATCH 4/5] Installation of DALI moved into parse_args(). 
--- scripts/classification/imagenet/test.sh | 4 +- .../classification/imagenet/train_imagenet.py | 58 ++++++++++++------- 2 files changed, 40 insertions(+), 22 deletions(-) diff --git a/scripts/classification/imagenet/test.sh b/scripts/classification/imagenet/test.sh index fbecaf77f4..f978a543cd 100755 --- a/scripts/classification/imagenet/test.sh +++ b/scripts/classification/imagenet/test.sh @@ -24,8 +24,8 @@ if [ -z "$TRAIN_DATA_DIR" ]; then export TRAIN_DATA_DIR=~/.mxnet/datasets/imagenet fi -if [ -z "$DALI_VERSION" ]; then - export DALI_VERSION=nvidia-dali-cuda100 +if [ -z "$DALI_VER" ]; then + export DALI_VER=nvidia-dali-cuda100 fi python train_imagenet.py --model $MODEL --data-backend $DATA_BACKEND --num-gpus $NUM_GPUS \ diff --git a/scripts/classification/imagenet/train_imagenet.py b/scripts/classification/imagenet/train_imagenet.py index 08d18e2d50..e02e17ca69 100644 --- a/scripts/classification/imagenet/train_imagenet.py +++ b/scripts/classification/imagenet/train_imagenet.py @@ -12,13 +12,8 @@ from gluoncv.model_zoo import get_model from gluoncv.utils import makedirs, LRSequential, LRScheduler -import dali - -dali_ver = 'DALI_VERSION' -dali_version = os.environ.get(dali_ver) if dali_ver in os.environ else 'nvidia-dali-cuda100' - # CLI -def parse_args(): +def parse_args(logger = None): def float_list(x): return list(map(float, x.split(','))) @@ -120,8 +115,40 @@ def float_list(x): help='a tuple of size 3 for the std rgb') parser.add_argument('--num-training-samples', type=int, default=1281167, help='Number of training samples') - parser = dali.add_dali_args(parser) - dali.add_data_args(parser) + if logger: + # DALI is expected to be used + try: + import dali + except ImportError: + raise ImportError('Unable to import modules dali.py') + + dali_ver = 'DALI_VER' + dali_version = os.environ.get(dali_ver) if dali_ver in os.environ else 'nvidia-dali-cuda100' + stream = os.popen("pip list | grep dali") + output = stream.read() + if output == '': + # DALI is not installed + cmd_install = 'pip install --extra-index-url https://developer.download.nvidia.com/compute/redist ' + dali_version + logger.info('DALI is supposed to be used, but it is not installed.\nWe can try to install it for you (and continue this test) OR\n' \ + 'this test will be stopped and you can later restart it after installing DALI manually') + answer = input('Do you want to install DALI now? 
(Y/N):') + if answer[0] == 'Y' or answer[0] == 'y': + logger.info('Trying to install DALI version \'%s\'' % dali_version) + ret = os.system(cmd_install) + if ret != 0: + logger.info('Cannot install DALI version \'%s\'.\n' \ + 'Perhaps, the latest DALI version should be used.\n' % dali_version) + else: + ret = 1 + logger.info('To install DALI, please, use:\n' + cmd_install) + if ret != 0: + logger.info('Please, see documentation on ' \ + 'https://docs.nvidia.com/deeplearning/dali/user-guide/docs/installation.html\n' \ + 'and set the environment variable %s to the appropriate version ID (default is \'%s\')' % (dali_ver, dali_version)) + raise RuntimeError('DALI is not installed') + + parser = dali.add_dali_args(parser) + dali.add_data_args(parser) return parser.parse_args() @@ -140,6 +167,8 @@ def batch_func(batch, ctx): logger.setLevel(logging.INFO) logger.addHandler(filehandler) logger.addHandler(streamhandler) + if opt.data_backend[0:5] == 'dali-': + opt = parse_args(logger) # Adding DALI parameters if opt.data_backend == 'dali-gpu' and opt.num_gpus == 0: stream = os.popen('nvidia-smi -L | wc -l') @@ -307,18 +336,7 @@ def get_data_loader(args): # Check if DALI is installed: if args.data_backend[0:5] == 'dali-': - stream = os.popen("pip list | grep dali") - output = stream.read() - if output == '': - # DALI is not installed - logger.info('DALI is not installed\nTrying to install DALI version \'%s\'' % dali_version) - ret = os.system('pip install --extra-index-url https://developer.download.nvidia.com/compute/redist ' + dali_version) - if ret != 0: - logger.info('Cannot install DALI version \'%s\'.\nPerhaps, the latest DALI version should be used.\n' \ - 'Please, see documentation on ' \ - 'https://docs.nvidia.com/deeplearning/dali/user-guide/docs/installation.html\n' \ - 'and set the environment variable %s to the appropriate version ID' % (dali_version, dali_ver)) - raise RuntimeError('DALI is not installed') + import dali if args.data_backend == 'dali-gpu': return (lambda *args, **kwargs: dali.get_rec_iter(*args, **kwargs, batch_fn=batch_func, dali_cpu=False)) if args.data_backend == 'dali-cpu': From 5dccefaad90ab5ee78885a12bbfb4c2c8dd6872b Mon Sep 17 00:00:00 2001 From: andreii Date: Sat, 10 Apr 2021 20:44:33 -0700 Subject: [PATCH 5/5] Changes, suggested by Yin Weisu --- .../classification/imagenet/train_imagenet.py | 24 +++++++++---------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/scripts/classification/imagenet/train_imagenet.py b/scripts/classification/imagenet/train_imagenet.py index e02e17ca69..83bab49ad5 100644 --- a/scripts/classification/imagenet/train_imagenet.py +++ b/scripts/classification/imagenet/train_imagenet.py @@ -117,16 +117,10 @@ def float_list(x): help='Number of training samples') if logger: # DALI is expected to be used - try: - import dali - except ImportError: - raise ImportError('Unable to import modules dali.py') - dali_ver = 'DALI_VER' dali_version = os.environ.get(dali_ver) if dali_ver in os.environ else 'nvidia-dali-cuda100' - stream = os.popen("pip list | grep dali") - output = stream.read() - if output == '': + stream = os.popen("pip list --format=columns | grep dali") + if stream.read() == '': # DALI is not installed cmd_install = 'pip install --extra-index-url https://developer.download.nvidia.com/compute/redist ' + dali_version logger.info('DALI is supposed to be used, but it is not installed.\nWe can try to install it for you (and continue this test) OR\n' \ @@ -141,11 +135,15 @@ def float_list(x): else: ret = 1 logger.info('To 
install DALI, please, use:\n' + cmd_install) - if ret != 0: - logger.info('Please, see documentation on ' \ - 'https://docs.nvidia.com/deeplearning/dali/user-guide/docs/installation.html\n' \ - 'and set the environment variable %s to the appropriate version ID (default is \'%s\')' % (dali_ver, dali_version)) - raise RuntimeError('DALI is not installed') + if ret != 0: + logger.info('Please, see documentation on ' \ + 'https://docs.nvidia.com/deeplearning/dali/user-guide/docs/installation.html\n' \ + 'and set the environment variable %s to the appropriate version ID (default is \'%s\')' % (dali_ver, dali_version)) + raise RuntimeError('DALI is not installed') + try: + import dali + except ImportError: + raise ImportError('Unable to import modules dali.py') parser = dali.add_dali_args(parser) dali.add_data_args(parser)
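

For reference, a minimal usage sketch of the new --data-backend option with the DALI GPU pipeline. The dataset path, model name, and GPU count below are illustrative placeholders taken from the defaults in test.sh, and DALI_VER is only consulted when DALI is not already installed:

# Assumes ImageNet RecordIO files (train.rec/train.idx, val.rec/val.idx) exist under $TRAIN_DATA_DIR.
export TRAIN_DATA_DIR=~/.mxnet/datasets/imagenet   # placeholder dataset location
export DALI_VER=nvidia-dali-cuda100                # package name used only if DALI must be installed on demand
python train_imagenet.py --model resnet18_v1 --data-backend dali-gpu --num-gpus 1 \
    --num-epochs 3 --num-training-samples 1281167 --use-rec \
    --rec-train $TRAIN_DATA_DIR/train.rec --rec-train-idx $TRAIN_DATA_DIR/train.idx \
    --rec-val $TRAIN_DATA_DIR/val.rec --rec-val-idx $TRAIN_DATA_DIR/val.idx \
    --dali-threads 3 --dali-prefetch-queue 2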