From 68c5d60d8816ae202f19ec0dba063d1b322bef9a Mon Sep 17 00:00:00 2001 From: andreii Date: Thu, 25 Mar 2021 16:29:45 -0700 Subject: [PATCH 1/5] Implementation of the use of DALI. --- scripts/classification/imagenet/dali.py | 197 +++++++++++++++ scripts/classification/imagenet/test.sh | 35 +++ .../classification/imagenet/train_imagenet.py | 239 +++++++++--------- 3 files changed, 358 insertions(+), 113 deletions(-) create mode 100644 scripts/classification/imagenet/dali.py create mode 100755 scripts/classification/imagenet/test.sh diff --git a/scripts/classification/imagenet/dali.py b/scripts/classification/imagenet/dali.py new file mode 100644 index 0000000000..cd42c41550 --- /dev/null +++ b/scripts/classification/imagenet/dali.py @@ -0,0 +1,197 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings +from nvidia.dali.pipeline import Pipeline +import nvidia.dali.ops as ops +import nvidia.dali.types as types +from nvidia.dali.plugin.mxnet import DALIClassificationIterator, LastBatchPolicy +import horovod.mxnet as hvd + + +def add_dali_args(parser): + group = parser.add_argument_group('DALI data backend', 'entire group applies only to dali data backend') + group.add_argument('--dali-separ-val', action='store_true', + help='each process will perform independent validation on whole val-set') + group.add_argument('--dali-threads', type=int, default=3, help="number of threads" +\ + "per GPU for DALI") + group.add_argument('--dali-validation-threads', type=int, default=10, help="number of threads" +\ + "per GPU for DALI for validation") + group.add_argument('--dali-prefetch-queue', type=int, default=2, help="DALI prefetch queue depth") + group.add_argument('--dali-nvjpeg-memory-padding', type=int, default=64, help="Memory padding value for nvJPEG (in MB)") + group.add_argument('--dali-fuse-decoder', type=int, default=1, help="0 or 1 whether to fuse decoder or not") + group.add_argument('--flag', type=int, default=1, help="Flag") + return parser + +def add_data_args(parser): + def int_list(x): + return list(map(int, x.split(','))) + + data = parser.add_argument_group('Data') + data.add_argument('--data-pred', type=str, help='the image on which run inference (only for pred mode)') + data.add_argument('--image-shape', type=int_list, default=[3, 224, 224], + help='the image shape feed into the network') + + data.add_argument('--input-layout', type=str, default='NCHW', choices=('NCHW', 'NHWC'), + help='the layout of the input data') + data.add_argument('--conv-layout', type=str, default='NCHW', choices=('NCHW', 'NHWC'), + help='the layout of the data assumed by the conv operation') + data.add_argument('--batchnorm-layout', type=str, default='NCHW', choices=('NCHW', 'NHWC'), + help='the layout of the data assumed by the batchnorm operation') + data.add_argument('--pooling-layout', type=str, default='NCHW', choices=('NCHW', 'NHWC'), + help='the layout of the data assumed by the pooling operation') + + 
data.add_argument('--num-examples', type=int, default=1281167, + help="the number of training examples (doesn't work with mxnet data backend)") + data.add_argument('--data-val-resize', type=int, default=256, + help='base length of shorter edge for validation dataset') + data.add_argument('--kv-store', type=str, default='device', choices=('device', 'horovod'), + help='key-value store type') + return data + + +def get_device_names(dali_cpu): + return ("cpu", "cpu") if dali_cpu else ("gpu", "mixed") + + +class HybridTrainPipe(Pipeline): + def __init__(self, args, batch_size, num_threads, device_id, rec_path, idx_path, + shard_id, num_shards, crop_shape, nvjpeg_padding, prefetch_queue=3, + output_layout=types.NCHW, pad_output=True, dtype='float16', dali_cpu=False): + super(HybridTrainPipe, self).__init__(batch_size, num_threads, device_id, seed=12 + device_id, prefetch_queue_depth = prefetch_queue) + self.input = ops.MXNetReader(path=[rec_path], index_path=[idx_path], + random_shuffle=True, shard_id=shard_id, num_shards=num_shards) + + dali_device, decoder_device = get_device_names(dali_cpu) + if args.dali_fuse_decoder: + self.decode = ops.ImageDecoderRandomCrop(device=decoder_device, output_type=types.RGB, + device_memory_padding=nvjpeg_padding, host_memory_padding=nvjpeg_padding) + else: + self.decode = ops.ImageDecoder(device=decoder_device, output_type=types.RGB, + device_memory_padding=nvjpeg_padding, host_memory_padding=nvjpeg_padding) + + if args.dali_fuse_decoder: + self.resize = ops.Resize(device=dali_device, resize_x=crop_shape[1], resize_y=crop_shape[0]) + else: + self.resize = ops.RandomResizedCrop(device=dali_device, size=crop_shape) + + self.cmnp = ops.CropMirrorNormalize(device="gpu", + dtype=types.FLOAT16 if dtype == 'float16' else types.FLOAT, + output_layout=output_layout, crop=crop_shape, pad_output=pad_output, + mean=args.rgb_mean, std=args.rgb_std) + self.coin = ops.random.CoinFlip(probability=0.5) + + def define_graph(self): + rng = self.coin() + self.jpegs, self.labels = self.input(name="Reader") + + images = self.decode(self.jpegs) + images = self.resize(images) + output = self.cmnp(images.gpu(), mirror=rng) + return [output, self.labels] + + +class HybridValPipe(Pipeline): + def __init__(self, args, batch_size, num_threads, device_id, rec_path, idx_path, + shard_id, num_shards, crop_shape, nvjpeg_padding, prefetch_queue=3, resize_shp=None, + output_layout=types.NCHW, pad_output=True, dtype='float16', dali_cpu=False): + super(HybridValPipe, self).__init__(batch_size, num_threads, device_id, seed=12 + device_id, prefetch_queue_depth=prefetch_queue) + self.input = ops.MXNetReader(path=[rec_path], index_path=[idx_path], + random_shuffle=False, shard_id=shard_id, num_shards=num_shards) + + dali_device, decoder_device = get_device_names(dali_cpu) + self.decode = ops.ImageDecoder(device=decoder_device, output_type=types.RGB, + device_memory_padding=nvjpeg_padding, + host_memory_padding=nvjpeg_padding) + self.resize = ops.Resize(device=dali_device, resize_shorter=resize_shp) if resize_shp else None + self.cmnp = ops.CropMirrorNormalize(device="gpu", + dtype=types.FLOAT16 if dtype == 'float16' else types.FLOAT, + output_layout=output_layout, crop=crop_shape, pad_output=pad_output, + mean=args.rgb_mean, std=args.rgb_std) + + def define_graph(self): + self.jpegs, self.labels = self.input(name="Reader") + images = self.decode(self.jpegs) + if self.resize: + images = self.resize(images) + output = self.cmnp(images.gpu()) + return [output, self.labels] + + +def get_rec_iter(args, 
kv=None, batch_fn=None, dali_cpu=False): + devices = [0] if dali_cpu else args.gpus + num_devices = len(devices) + pad_output = (args.image_shape[0] == 4) + + # the input_layout w.r.t. the model is the output_layout of the image pipeline + output_layout = types.NHWC if args.input_layout == 'NHWC' else types.NCHW + + if 'horovod' in args.kv_store: + rank = hvd.rank() + nWrk = hvd.size() + else: + rank = kv.rank if kv else 0 + nWrk = kv.num_workers if kv else 1 + + batch_size = args.batch_size // nWrk * num_devices + + trainpipes = [HybridTrainPipe(args = args, + batch_size = batch_size, + num_threads = args.dali_threads, + device_id = dev_id, + rec_path = args.rec_train, + idx_path = args.rec_train_idx, + shard_id = devices.index(dev_id) + num_devices*rank, + num_shards = num_devices*nWrk, + crop_shape = args.image_shape[1:], + output_layout = output_layout, + dtype = args.dtype, + pad_output = pad_output, + dali_cpu = dali_cpu, + nvjpeg_padding = args.dali_nvjpeg_memory_padding * 1024 * 1024, + prefetch_queue = args.dali_prefetch_queue) for dev_id in devices] + trainpipes[0].build() + num_examples = trainpipes[0].epoch_size("Reader") + if args.num_examples < num_examples: + warnings.warn("{} training examples will be used, although full training set contains {} examples".format(args.num_examples, num_examples)) + + train_examples = args.num_examples // nWrk + dali_train_iter = DALIClassificationIterator(trainpipes, train_examples) + if not args.rec_val: + return dali_train_iter, None, batch_fn + + valpipes = [HybridValPipe(args = args, + batch_size = batch_size, + num_threads = args.dali_validation_threads, + device_id = dev_id, + rec_path = args.rec_val, + idx_path = args.rec_val_idx, + shard_id = 0 if args.dali_separ_val else devices.index(dev_id) + num_devices*rank, + num_shards = 1 if args.dali_separ_val else num_devices*nWrk, + crop_shape = args.image_shape[1:], + resize_shp = args.data_val_resize, + output_layout = output_layout, + dtype = args.dtype, + pad_output = pad_output, + dali_cpu = dali_cpu, + nvjpeg_padding = args.dali_nvjpeg_memory_padding * 1024 * 1024, + prefetch_queue = args.dali_prefetch_queue) for dev_id in devices] + valpipes[0].build() + worker_val_examples = valpipes[0].epoch_size("Reader") + if not args.dali_separ_val: + adj = 1 if rank < worker_val_examples % nWrk else 0 + worker_val_examples = adj + worker_val_examples // nWrk + + dali_val_iter = DALIClassificationIterator(valpipes, worker_val_examples) + return dali_train_iter, dali_val_iter, batch_fn diff --git a/scripts/classification/imagenet/test.sh b/scripts/classification/imagenet/test.sh new file mode 100755 index 0000000000..50e69f0221 --- /dev/null +++ b/scripts/classification/imagenet/test.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +if [ -z "$MODEL" ]; then + export MODEL=resnet18_v1 +fi + +if [ -z "$NUM_TRAINING_SAMPLES" ]; then + export NUM_TRAINING_SAMPLES=1281167 +fi + +if [ -z "$NUM_EPOCHS" ]; then + export NUM_EPOCHS=3 +fi + +if [ -z "$NUM_GPUS" ] || [ $NUM_GPUS '-lt' 0 ]; then + export NUM_GPUS=0 +fi + +if [ -z "$DATA_BACKEND" ]; then + export DATA_BACKEND='mxnet' # Options are: dali-gpu, dali-cpu, mxnet +fi + +if [ -z "$TRAIN_DATA_DIR" ]; then + export TRAIN_DATA_DIR=~/.mxnet/datasets/imagenet + export TRAIN_DATA_DIR=/data/imagenet/train-480-val-256-recordio +fi + +pip install --extra-index-url https://developer.download.nvidia.com/compute/redist nvidia-dali-cuda100 + +python train_imagenet.py --model $MODEL --data-backend $DATA_BACKEND --num-gpus $NUM_GPUS \ + --num-epochs $NUM_EPOCHS 
--num-training-samples $NUM_TRAINING_SAMPLES --use-rec \ + --rec-train $TRAIN_DATA_DIR/train.rec --rec-train-idx $TRAIN_DATA_DIR/train.idx \ + --rec-val $TRAIN_DATA_DIR/val.rec --rec-val-idx $TRAIN_DATA_DIR/val.idx --data-dir $TRAIN_DATA_DIR \ + + diff --git a/scripts/classification/imagenet/train_imagenet.py b/scripts/classification/imagenet/train_imagenet.py index 7bd70f0023..99bc948e0f 100644 --- a/scripts/classification/imagenet/train_imagenet.py +++ b/scripts/classification/imagenet/train_imagenet.py @@ -2,7 +2,6 @@ import numpy as np import mxnet as mx -import gluoncv as gcv from mxnet import gluon, nd from mxnet import autograd as ag from mxnet.gluon.data.vision import transforms @@ -13,18 +12,26 @@ from gluoncv.model_zoo import get_model from gluoncv.utils import makedirs, LRSequential, LRScheduler +import dali + # CLI def parse_args(): + def float_list(x): + return list(map(float, x.split(','))) + + data_dir = '~/.mxnet/datasets/imagenet/' parser = argparse.ArgumentParser(description='Train a model for image classification.') - parser.add_argument('--data-dir', type=str, default='~/.mxnet/datasets/imagenet', + parser.add_argument('--data-backend', choices=('dali-gpu', 'dali-cpu', 'mxnet'), default='mxnet', + help='set data loading & augmentation backend') + parser.add_argument('--data-dir', type=str, default=data_dir, help='training and validation pictures to use.') - parser.add_argument('--rec-train', type=str, default='~/.mxnet/datasets/imagenet/rec/train.rec', + parser.add_argument('--rec-train', type=str, default=data_dir+'rec/train.rec', help='the training data') - parser.add_argument('--rec-train-idx', type=str, default='~/.mxnet/datasets/imagenet/rec/train.idx', + parser.add_argument('--rec-train-idx', type=str, default=data_dir+'rec/train.idx', help='the index of training data') - parser.add_argument('--rec-val', type=str, default='~/.mxnet/datasets/imagenet/rec/val.rec', + parser.add_argument('--rec-val', type=str, default=data_dir+'rec/val.rec', help='the validation data') - parser.add_argument('--rec-val-idx', type=str, default='~/.mxnet/datasets/imagenet/rec/val.idx', + parser.add_argument('--rec-val-idx', type=str, default=data_dir+'rec/val.idx', help='the index of validation data') parser.add_argument('--use-rec', action='store_true', help='use image record iter for data input. 
default is false.') @@ -104,11 +111,23 @@ def parse_args(): help='name of training log file') parser.add_argument('--use-gn', action='store_true', help='whether to use group norm.') - opt = parser.parse_args() - return opt + parser.add_argument('--rgb-mean', type=float_list, default=[123.68, 116.779, 103.939], + help='a tuple of size 3 for the mean rgb') + parser.add_argument('--rgb-std', type=float_list, default=[58.393, 57.12, 57.375], + help='a tuple of size 3 for the std rgb') + parser.add_argument('--num-training-samples', type=int, default=1281167, + help='Number of training samples') + parser = dali.add_dali_args(parser) + dali.add_data_args(parser) + return parser.parse_args() def main(): + def batch_func(batch, ctx): + data = gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0) + label = gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0) + return data, label + opt = parse_args() filehandler = logging.FileHandler(opt.logging_file) @@ -121,14 +140,11 @@ def main(): logger.info(opt) - batch_size = opt.batch_size classes = 1000 - num_training_samples = 1281167 - num_gpus = opt.num_gpus - batch_size *= max(1, num_gpus) + batch_size = opt.batch_size * max(1, num_gpus) context = [mx.gpu(i) for i in range(num_gpus)] if num_gpus > 0 else [mx.cpu()] - num_workers = opt.num_workers + opt.gpus = [i for i in range(num_gpus)] lr_decay = opt.lr_decay lr_decay_period = opt.lr_decay_period @@ -137,7 +153,7 @@ def main(): else: lr_decay_epoch = [int(i) for i in opt.lr_decay_epoch.split(',')] lr_decay_epoch = [e - opt.warmup_epochs for e in lr_decay_epoch] - num_batches = num_training_samples // batch_size + num_batches = opt.num_training_samples // batch_size lr_scheduler = LRSequential([ LRScheduler('linear', base_lr=0, target_lr=opt.lr, @@ -173,47 +189,39 @@ def main(): net.load_parameters(opt.resume_params, ctx = context) # teacher model for distillation training - if opt.teacher is not None and opt.hard_weight < 1.0: - teacher_name = opt.teacher - teacher = get_model(teacher_name, pretrained=True, classes=classes, ctx=context) + distillation = opt.teacher is not None and opt.hard_weight < 1.0 + if distillation: + teacher = get_model(opt.teacher, pretrained=True, classes=classes, ctx=context) teacher.cast(opt.dtype) - distillation = True - else: - distillation = False # Two functions for reading data from record file or raw images - def get_data_rec(rec_train, rec_train_idx, rec_val, rec_val_idx, batch_size, num_workers): - rec_train = os.path.expanduser(rec_train) - rec_train_idx = os.path.expanduser(rec_train_idx) - rec_val = os.path.expanduser(rec_val) - rec_val_idx = os.path.expanduser(rec_val_idx) + def get_data_rec(args): + rec_train = os.path.expanduser(args.rec_train) + rec_train_idx = os.path.expanduser(args.rec_train_idx) + rec_val = os.path.expanduser(args.rec_val) + rec_val_idx = os.path.expanduser(args.rec_val_idx) + num_gpus = args.num_gpus + batch_size = args.batch_size * max(1, num_gpus) + jitter_param = 0.4 lighting_param = 0.1 input_size = opt.input_size crop_ratio = opt.crop_ratio if opt.crop_ratio > 0 else 0.875 resize = int(math.ceil(input_size / crop_ratio)) - mean_rgb = [123.68, 116.779, 103.939] - std_rgb = [58.393, 57.12, 57.375] - - def batch_fn(batch, ctx): - data = gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0) - label = gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0) - return data, label train_data = mx.io.ImageRecordIter( path_imgrec = rec_train, path_imgidx = rec_train_idx, - 
preprocess_threads = num_workers, + preprocess_threads = args.num_workers, shuffle = True, batch_size = batch_size, - data_shape = (3, input_size, input_size), - mean_r = mean_rgb[0], - mean_g = mean_rgb[1], - mean_b = mean_rgb[2], - std_r = std_rgb[0], - std_g = std_rgb[1], - std_b = std_rgb[2], + mean_r = args.rgb_mean[0], + mean_g = args.rgb_mean[1], + mean_b = args.rgb_mean[2], + std_r = args.rgb_std[0], + std_g = args.rgb_std[1], + std_b = args.rgb_std[2], rand_mirror = True, random_resized_crop = True, max_aspect_ratio = 4. / 3., @@ -228,22 +236,25 @@ def batch_fn(batch, ctx): val_data = mx.io.ImageRecordIter( path_imgrec = rec_val, path_imgidx = rec_val_idx, - preprocess_threads = num_workers, + preprocess_threads = args.num_workers, shuffle = False, batch_size = batch_size, resize = resize, data_shape = (3, input_size, input_size), - mean_r = mean_rgb[0], - mean_g = mean_rgb[1], - mean_b = mean_rgb[2], - std_r = std_rgb[0], - std_g = std_rgb[1], - std_b = std_rgb[2], + mean_r = args.rgb_mean[0], + mean_g = args.rgb_mean[1], + mean_b = args.rgb_mean[2], + std_r = args.rgb_std[0], + std_g = args.rgb_std[1], + std_b = args.rgb_std[2], ) - return train_data, val_data, batch_fn + return train_data, val_data, batch_func - def get_data_loader(data_dir, batch_size, num_workers): + def get_data_rec_transfomed(args): + data_dir = args.data_dir + num_workers = args.num_workers + batch_size = args.batch_size * max(1, args.num_gpus) normalize = transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) jitter_param = 0.4 lighting_param = 0.1 @@ -281,17 +292,18 @@ def batch_fn(batch, ctx): return train_data, val_data, batch_fn - if opt.use_rec: - train_data, val_data, batch_fn = get_data_rec(opt.rec_train, opt.rec_train_idx, - opt.rec_val, opt.rec_val_idx, - batch_size, num_workers) - else: - train_data, val_data, batch_fn = get_data_loader(opt.data_dir, batch_size, num_workers) + def get_data_loader(args): + if args.data_backend == 'dali-gpu': + return (lambda *args, **kwargs: dali.get_rec_iter(*args, **kwargs, batch_fn=batch_func, dali_cpu=False)) + if args.data_backend == 'dali-cpu': + return (lambda *args, **kwargs: dali.get_rec_iter(*args, **kwargs, batch_fn=batch_func, dali_cpu=True)) + if args.data_backend == 'mxnet': + return get_data_rec if args.use_rec else get_data_rec_transfomed + raise ValueError('Wrong data backend') - if opt.mixup: - train_metric = mx.metric.RMSE() - else: - train_metric = mx.metric.Accuracy() + + train_data, val_data, batch_fn = get_data_loader(opt)(opt) + train_metric = mx.metric.RMSE() if opt.mixup else mx.metric.Accuracy() acc_top1 = mx.metric.Accuracy() acc_top5 = mx.metric.TopKAccuracy(5) @@ -322,16 +334,17 @@ def smooth(label, classes, eta=0.1): smoothed.append(res) return smoothed - def test(ctx, val_data): + def test(ctx, val_data, val_batch): if opt.use_rec: val_data.reset() acc_top1.reset() acc_top5.reset() for i, batch in enumerate(val_data): - data, label = batch_fn(batch, ctx) - outputs = [net(X.astype(opt.dtype, copy=False)) for X in data] - acc_top1.update(label, outputs) - acc_top5.update(label, outputs) + for j in range(val_batch): + data, label = batch_fn(batch[j], ctx) if type(batch) == list else batch_fn(batch, ctx) + outputs = [net(X.astype(opt.dtype, copy=False)) for X in data] + acc_top1.update(label, outputs) + acc_top5.update(label, outputs) _, top1 = acc_top1.get() _, top5 = acc_top5.get() @@ -351,10 +364,7 @@ def train(ctx): if opt.resume_states != '': trainer.load_states(opt.resume_states) - if opt.label_smoothing or opt.mixup: 
- sparse_label_loss = False - else: - sparse_label_loss = True + sparse_label_loss = not (opt.label_smoothing or opt.mixup) if distillation: L = gcv.loss.DistillationSoftmaxCrossEntropyLoss(temperature=opt.temperature, hard_weight=opt.hard_weight, @@ -363,7 +373,9 @@ def train(ctx): L = gluon.loss.SoftmaxCrossEntropyLoss(sparse_label=sparse_label_loss) best_val_score = 1 - + eta = 0.1 if opt.label_smoothing else 0.0 + start_time = time.time() + val_batch = len(ctx) if opt.data_backend != 'mxnet' else 1 for epoch in range(opt.resume_epoch, opt.num_epochs): tic = time.time() if opt.use_rec: @@ -372,64 +384,64 @@ def train(ctx): btic = time.time() for i, batch in enumerate(train_data): - data, label = batch_fn(batch, ctx) - - if opt.mixup: - lam = np.random.beta(opt.mixup_alpha, opt.mixup_alpha) - if epoch >= opt.num_epochs - opt.mixup_off_epoch: - lam = 1 - data = [lam*X + (1-lam)*X[::-1] for X in data] + for j in range(val_batch): + data, label = batch_fn(batch[j], ctx) if type(batch) == list else batch_fn(batch, ctx) + if opt.mixup: + lam = np.random.beta(opt.mixup_alpha, opt.mixup_alpha) + if epoch >= opt.num_epochs - opt.mixup_off_epoch: + lam = 1 + data = [lam*X + (1-lam)*X[::-1] for X in data] + label = mixup_transform(label, classes, lam, eta) + + elif opt.label_smoothing: + hard_label = label + label = smooth(label, classes) - if opt.label_smoothing: - eta = 0.1 - else: - eta = 0.0 - label = mixup_transform(label, classes, lam, eta) - - elif opt.label_smoothing: - hard_label = label - label = smooth(label, classes) - - if distillation: - teacher_prob = [nd.softmax(teacher(X.astype(opt.dtype, copy=False)) / opt.temperature) \ - for X in data] - - with ag.record(): - outputs = [net(X.astype(opt.dtype, copy=False)) for X in data] if distillation: - loss = [L(yhat.astype('float32', copy=False), - y.astype('float32', copy=False), - p.astype('float32', copy=False)) for yhat, y, p in zip(outputs, label, teacher_prob)] - else: - loss = [L(yhat, y.astype(opt.dtype, copy=False)) for yhat, y in zip(outputs, label)] - for l in loss: - l.backward() - trainer.step(batch_size) - - if opt.mixup: - output_softmax = [nd.SoftmaxActivation(out.astype('float32', copy=False)) \ - for out in outputs] - train_metric.update(label, output_softmax) - else: - if opt.label_smoothing: - train_metric.update(hard_label, outputs) + teacher_prob = [nd.softmax(teacher(X.astype(opt.dtype, copy=False)) / opt.temperature) \ + for X in data] + + with ag.record(): + outputs = [net(X.astype(opt.dtype, copy=False)) for X in data] + if distillation: + loss = [L(yhat.astype('float32', copy=False), + y.astype('float32', copy=False), + p.astype('float32', copy=False)) for yhat, y, p in zip(outputs, label, teacher_prob)] + else: + loss = [L(yhat, y.astype(opt.dtype, copy=False)) for yhat, y in zip(outputs, label)] + for l in loss: + l.backward() + trainer.step(batch_size) + + if opt.mixup: + output_softmax = [nd.SoftmaxActivation(out.astype('float32', copy=False)) for out in outputs] + train_metric.update(label, output_softmax) else: - train_metric.update(label, outputs) + if opt.label_smoothing: + train_metric.update(hard_label, outputs) + else: + train_metric.update(label, outputs) if opt.log_interval and not (i+1)%opt.log_interval: train_metric_name, train_metric_score = train_metric.get() logger.info('Epoch[%d] Batch [%d]\tSpeed: %f samples/sec\t%s=%f\tlr=%f'%( - epoch, i, batch_size*opt.log_interval/(time.time()-btic), + epoch, i+1, batch_size*opt.log_interval*val_batch/(time.time()-btic), train_metric_name, 
train_metric_score, trainer.learning_rate)) btic = time.time() train_metric_name, train_metric_score = train_metric.get() - throughput = int(batch_size * i /(time.time() - tic)) + if opt.log_interval and i % opt.log_interval: + # We did NOT report the speed on the last iteration of the loop. Let's do it now + logger.info('Epoch[%d] Batch [%d]\tSpeed: %f samples/sec\t%s=%f\tlr=%f'%( + epoch, i, batch_size*(i%opt.log_interval)*val_batch/(time.time()-btic), + train_metric_name, train_metric_score, trainer.learning_rate)) - err_top1_val, err_top5_val = test(ctx, val_data) + epoch_time = time.time() - tic + throughput = int(batch_size * i * val_batch / epoch_time) + err_top1_val, err_top5_val = test(ctx, val_data, val_batch) logger.info('[Epoch %d] training: %s=%f'%(epoch, train_metric_name, train_metric_score)) - logger.info('[Epoch %d] speed: %d samples/sec\ttime cost: %f'%(epoch, throughput, time.time()-tic)) + logger.info('[Epoch %d] speed: %d samples/sec\ttime cost: %f'%(epoch, throughput, epoch_time)) logger.info('[Epoch %d] validation: err-top1=%f err-top5=%f'%(epoch, err_top1_val, err_top5_val)) if err_top1_val < best_val_score: @@ -445,6 +457,7 @@ def train(ctx): net.save_parameters('%s/imagenet-%s-%d.params'%(save_dir, model_name, opt.num_epochs-1)) trainer.save_states('%s/imagenet-%s-%d.states'%(save_dir, model_name, opt.num_epochs-1)) + logger.info('Training time for %d epochs: %f sec.'%(opt.num_epochs, time.time() - start_time)) if opt.mode == 'hybrid': net.hybridize(static_alloc=True, static_shape=True) From 20232528ce30c272074506db61bc3307359b9941 Mon Sep 17 00:00:00 2001 From: andreii Date: Thu, 25 Mar 2021 16:31:44 -0700 Subject: [PATCH 2/5] Default value for TRAIN_DATA_DIR changed --- scripts/classification/imagenet/test.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/classification/imagenet/test.sh b/scripts/classification/imagenet/test.sh index 50e69f0221..0e56162981 100755 --- a/scripts/classification/imagenet/test.sh +++ b/scripts/classification/imagenet/test.sh @@ -22,7 +22,6 @@ fi if [ -z "$TRAIN_DATA_DIR" ]; then export TRAIN_DATA_DIR=~/.mxnet/datasets/imagenet - export TRAIN_DATA_DIR=/data/imagenet/train-480-val-256-recordio fi pip install --extra-index-url https://developer.download.nvidia.com/compute/redist nvidia-dali-cuda100 From dec5bc8a35c9ddd6da8475cfb93d06b11ad9004f Mon Sep 17 00:00:00 2001 From: andreii Date: Mon, 29 Mar 2021 14:13:42 -0700 Subject: [PATCH 3/5] Installation of DALI moved into train_imagenet.py --- scripts/classification/imagenet/test.sh | 4 ++- .../classification/imagenet/train_imagenet.py | 28 +++++++++++++++++-- 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/scripts/classification/imagenet/test.sh b/scripts/classification/imagenet/test.sh index 0e56162981..fbecaf77f4 100755 --- a/scripts/classification/imagenet/test.sh +++ b/scripts/classification/imagenet/test.sh @@ -24,7 +24,9 @@ if [ -z "$TRAIN_DATA_DIR" ]; then export TRAIN_DATA_DIR=~/.mxnet/datasets/imagenet fi -pip install --extra-index-url https://developer.download.nvidia.com/compute/redist nvidia-dali-cuda100 +if [ -z "$DALI_VERSION" ]; then + export DALI_VERSION=nvidia-dali-cuda100 +fi python train_imagenet.py --model $MODEL --data-backend $DATA_BACKEND --num-gpus $NUM_GPUS \ --num-epochs $NUM_EPOCHS --num-training-samples $NUM_TRAINING_SAMPLES --use-rec \ diff --git a/scripts/classification/imagenet/train_imagenet.py b/scripts/classification/imagenet/train_imagenet.py index 99bc948e0f..08d18e2d50 100644 --- 
a/scripts/classification/imagenet/train_imagenet.py +++ b/scripts/classification/imagenet/train_imagenet.py @@ -14,6 +14,9 @@ import dali +dali_ver = 'DALI_VERSION' +dali_version = os.environ.get(dali_ver) if dali_ver in os.environ else 'nvidia-dali-cuda100' + # CLI def parse_args(): def float_list(x): @@ -138,6 +141,12 @@ def batch_func(batch, ctx): logger.addHandler(filehandler) logger.addHandler(streamhandler) + if opt.data_backend == 'dali-gpu' and opt.num_gpus == 0: + stream = os.popen('nvidia-smi -L | wc -l') + opt.num_gpus = int(stream.read()) + logger.info("When '--data-backend' is equal to 'dali-gpu', then `--num-gpus` should NOT be 0\n" \ + "For now '--num-gpus' will be set to the number of GPUs installed: %d" % opt.num_gpus) + logger.info(opt) classes = 1000 @@ -293,12 +302,27 @@ def batch_fn(batch, ctx): return train_data, val_data, batch_fn def get_data_loader(args): + if args.data_backend == 'mxnet': + return get_data_rec if args.use_rec else get_data_rec_transfomed + + # Check if DALI is installed: + if args.data_backend[0:5] == 'dali-': + stream = os.popen("pip list | grep dali") + output = stream.read() + if output == '': + # DALI is not installed + logger.info('DALI is not installed\nTrying to install DALI version \'%s\'' % dali_version) + ret = os.system('pip install --extra-index-url https://developer.download.nvidia.com/compute/redist ' + dali_version) + if ret != 0: + logger.info('Cannot install DALI version \'%s\'.\nPerhaps, the latest DALI version should be used.\n' \ + 'Please, see documentation on ' \ + 'https://docs.nvidia.com/deeplearning/dali/user-guide/docs/installation.html\n' \ + 'and set the environment variable %s to the appropriate version ID' % (dali_version, dali_ver)) + raise RuntimeError('DALI is not installed') if args.data_backend == 'dali-gpu': return (lambda *args, **kwargs: dali.get_rec_iter(*args, **kwargs, batch_fn=batch_func, dali_cpu=False)) if args.data_backend == 'dali-cpu': return (lambda *args, **kwargs: dali.get_rec_iter(*args, **kwargs, batch_fn=batch_func, dali_cpu=True)) - if args.data_backend == 'mxnet': - return get_data_rec if args.use_rec else get_data_rec_transfomed raise ValueError('Wrong data backend') From a0c1ee25b0da8ae82474da4ce4d2acd11b55ec81 Mon Sep 17 00:00:00 2001 From: andreii Date: Wed, 7 Apr 2021 15:17:31 -0700 Subject: [PATCH 4/5] Installation of DALI moved into parse_args(). 
--- scripts/classification/imagenet/test.sh | 4 +- .../classification/imagenet/train_imagenet.py | 58 ++++++++++++------- 2 files changed, 40 insertions(+), 22 deletions(-) diff --git a/scripts/classification/imagenet/test.sh b/scripts/classification/imagenet/test.sh index fbecaf77f4..f978a543cd 100755 --- a/scripts/classification/imagenet/test.sh +++ b/scripts/classification/imagenet/test.sh @@ -24,8 +24,8 @@ if [ -z "$TRAIN_DATA_DIR" ]; then export TRAIN_DATA_DIR=~/.mxnet/datasets/imagenet fi -if [ -z "$DALI_VERSION" ]; then - export DALI_VERSION=nvidia-dali-cuda100 +if [ -z "$DALI_VER" ]; then + export DALI_VER=nvidia-dali-cuda100 fi python train_imagenet.py --model $MODEL --data-backend $DATA_BACKEND --num-gpus $NUM_GPUS \ diff --git a/scripts/classification/imagenet/train_imagenet.py b/scripts/classification/imagenet/train_imagenet.py index 08d18e2d50..e02e17ca69 100644 --- a/scripts/classification/imagenet/train_imagenet.py +++ b/scripts/classification/imagenet/train_imagenet.py @@ -12,13 +12,8 @@ from gluoncv.model_zoo import get_model from gluoncv.utils import makedirs, LRSequential, LRScheduler -import dali - -dali_ver = 'DALI_VERSION' -dali_version = os.environ.get(dali_ver) if dali_ver in os.environ else 'nvidia-dali-cuda100' - # CLI -def parse_args(): +def parse_args(logger = None): def float_list(x): return list(map(float, x.split(','))) @@ -120,8 +115,40 @@ def float_list(x): help='a tuple of size 3 for the std rgb') parser.add_argument('--num-training-samples', type=int, default=1281167, help='Number of training samples') - parser = dali.add_dali_args(parser) - dali.add_data_args(parser) + if logger: + # DALI is expected to be used + try: + import dali + except ImportError: + raise ImportError('Unable to import modules dali.py') + + dali_ver = 'DALI_VER' + dali_version = os.environ.get(dali_ver) if dali_ver in os.environ else 'nvidia-dali-cuda100' + stream = os.popen("pip list | grep dali") + output = stream.read() + if output == '': + # DALI is not installed + cmd_install = 'pip install --extra-index-url https://developer.download.nvidia.com/compute/redist ' + dali_version + logger.info('DALI is supposed to be used, but it is not installed.\nWe can try to install it for you (and continue this test) OR\n' \ + 'this test will be stopped and you can later restart it after installing DALI manually') + answer = input('Do you want to install DALI now? 
(Y/N):') + if answer[0] == 'Y' or answer[0] == 'y': + logger.info('Trying to install DALI version \'%s\'' % dali_version) + ret = os.system(cmd_install) + if ret != 0: + logger.info('Cannot install DALI version \'%s\'.\n' \ + 'Perhaps, the latest DALI version should be used.\n' % dali_version) + else: + ret = 1 + logger.info('To install DALI, please, use:\n' + cmd_install) + if ret != 0: + logger.info('Please, see documentation on ' \ + 'https://docs.nvidia.com/deeplearning/dali/user-guide/docs/installation.html\n' \ + 'and set the environment variable %s to the appropriate version ID (default is \'%s\')' % (dali_ver, dali_version)) + raise RuntimeError('DALI is not installed') + + parser = dali.add_dali_args(parser) + dali.add_data_args(parser) return parser.parse_args() @@ -140,6 +167,8 @@ def batch_func(batch, ctx): logger.setLevel(logging.INFO) logger.addHandler(filehandler) logger.addHandler(streamhandler) + if opt.data_backend[0:5] == 'dali-': + opt = parse_args(logger) # Adding DALI parameters if opt.data_backend == 'dali-gpu' and opt.num_gpus == 0: stream = os.popen('nvidia-smi -L | wc -l') @@ -307,18 +336,7 @@ def get_data_loader(args): # Check if DALI is installed: if args.data_backend[0:5] == 'dali-': - stream = os.popen("pip list | grep dali") - output = stream.read() - if output == '': - # DALI is not installed - logger.info('DALI is not installed\nTrying to install DALI version \'%s\'' % dali_version) - ret = os.system('pip install --extra-index-url https://developer.download.nvidia.com/compute/redist ' + dali_version) - if ret != 0: - logger.info('Cannot install DALI version \'%s\'.\nPerhaps, the latest DALI version should be used.\n' \ - 'Please, see documentation on ' \ - 'https://docs.nvidia.com/deeplearning/dali/user-guide/docs/installation.html\n' \ - 'and set the environment variable %s to the appropriate version ID' % (dali_version, dali_ver)) - raise RuntimeError('DALI is not installed') + import dali if args.data_backend == 'dali-gpu': return (lambda *args, **kwargs: dali.get_rec_iter(*args, **kwargs, batch_fn=batch_func, dali_cpu=False)) if args.data_backend == 'dali-cpu': From 5dccefaad90ab5ee78885a12bbfb4c2c8dd6872b Mon Sep 17 00:00:00 2001 From: andreii Date: Sat, 10 Apr 2021 20:44:33 -0700 Subject: [PATCH 5/5] Changes, suggested by Yin Weisu --- .../classification/imagenet/train_imagenet.py | 24 +++++++++---------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/scripts/classification/imagenet/train_imagenet.py b/scripts/classification/imagenet/train_imagenet.py index e02e17ca69..83bab49ad5 100644 --- a/scripts/classification/imagenet/train_imagenet.py +++ b/scripts/classification/imagenet/train_imagenet.py @@ -117,16 +117,10 @@ def float_list(x): help='Number of training samples') if logger: # DALI is expected to be used - try: - import dali - except ImportError: - raise ImportError('Unable to import modules dali.py') - dali_ver = 'DALI_VER' dali_version = os.environ.get(dali_ver) if dali_ver in os.environ else 'nvidia-dali-cuda100' - stream = os.popen("pip list | grep dali") - output = stream.read() - if output == '': + stream = os.popen("pip list --format=columns | grep dali") + if stream.read() == '': # DALI is not installed cmd_install = 'pip install --extra-index-url https://developer.download.nvidia.com/compute/redist ' + dali_version logger.info('DALI is supposed to be used, but it is not installed.\nWe can try to install it for you (and continue this test) OR\n' \ @@ -141,11 +135,15 @@ def float_list(x): else: ret = 1 logger.info('To 
install DALI, please, use:\n' + cmd_install) - if ret != 0: - logger.info('Please, see documentation on ' \ - 'https://docs.nvidia.com/deeplearning/dali/user-guide/docs/installation.html\n' \ - 'and set the environment variable %s to the appropriate version ID (default is \'%s\')' % (dali_ver, dali_version)) - raise RuntimeError('DALI is not installed') + if ret != 0: + logger.info('Please, see documentation on ' \ + 'https://docs.nvidia.com/deeplearning/dali/user-guide/docs/installation.html\n' \ + 'and set the environment variable %s to the appropriate version ID (default is \'%s\')' % (dali_ver, dali_version)) + raise RuntimeError('DALI is not installed') + try: + import dali + except ImportError: + raise ImportError('Unable to import modules dali.py') parser = dali.add_dali_args(parser) dali.add_data_args(parser)
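

For reference, a minimal usage sketch of the new --data-backend option with the DALI GPU pipeline. The dataset path, model name, and GPU count below are illustrative placeholders taken from the defaults in test.sh, and DALI_VER is only consulted when DALI is not already installed:

# Assumes ImageNet RecordIO files (train.rec/train.idx, val.rec/val.idx) exist under $TRAIN_DATA_DIR.
export TRAIN_DATA_DIR=~/.mxnet/datasets/imagenet   # placeholder dataset location
export DALI_VER=nvidia-dali-cuda100                # package name used only if DALI must be installed on demand
python train_imagenet.py --model resnet18_v1 --data-backend dali-gpu --num-gpus 1 \
    --num-epochs 3 --num-training-samples 1281167 --use-rec \
    --rec-train $TRAIN_DATA_DIR/train.rec --rec-train-idx $TRAIN_DATA_DIR/train.idx \
    --rec-val $TRAIN_DATA_DIR/val.rec --rec-val-idx $TRAIN_DATA_DIR/val.idx \
    --dali-threads 3 --dali-prefetch-queue 2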