From 8b769e23114b9bab9879dedebfc64b54f2f6af63 Mon Sep 17 00:00:00 2001
From: huangxu96
Date: Wed, 11 Nov 2020 08:35:44 +0000
Subject: [PATCH 1/2] Add fp16 training for ResNeXt101

---
 PaddleCV/image_classification/build_model.py    |  4 ++--
 PaddleCV/image_classification/models/resnext.py | 45 ++++++++++++-------
 PaddleCV/image_classification/train.py          |  8 ++++++
 3 files changed, 39 insertions(+), 18 deletions(-)

diff --git a/PaddleCV/image_classification/build_model.py b/PaddleCV/image_classification/build_model.py
index 3f7a3a8075..5b45432924 100644
--- a/PaddleCV/image_classification/build_model.py
+++ b/PaddleCV/image_classification/build_model.py
@@ -35,15 +35,15 @@ def _calc_label_smoothing_loss(softmax_out, label, class_dim, epsilon):
 def _basic_model(data, model, args, is_train):
     image = data[0]
     label = data[1]
-    if args.model == "ResNet50":
+    if args.model in ["ResNet50", "ResNeXt101_32x4d"]:
         image_in = fluid.layers.transpose(
             image, [0, 2, 3, 1]) if args.data_format == 'NHWC' else image
         image_in.stop_gradient = image.stop_gradient
         net_out = model.net(input=image_in,
                             class_dim=args.class_dim,
                             data_format=args.data_format)
     else:
         net_out = model.net(input=image, class_dim=args.class_dim)
 
     softmax_out = fluid.layers.softmax(net_out, use_cudnn=False)
     if is_train and args.use_label_smoothing:
@@ -95,15 +95,15 @@ def _mixup_model(data, model, args, is_train):
     y_b = data[2]
     lam = data[3]
 
-    if args.model == "ResNet50":
+    if args.model in ["ResNet50", "ResNeXt101_32x4d"]:
         image_in = fluid.layers.transpose(
             image, [0, 2, 3, 1]) if args.data_format == 'NHWC' else image
         image_in.stop_gradient = image.stop_gradient
         net_out = model.net(input=image_in,
                             class_dim=args.class_dim,
                             data_format=args.data_format)
     else:
         net_out = model.net(input=image, class_dim=args.class_dim)
     softmax_out = fluid.layers.softmax(net_out, use_cudnn=False)
     if not args.use_label_smoothing:
         loss_a = fluid.layers.cross_entropy(input=softmax_out, label=y_a)
diff --git a/PaddleCV/image_classification/models/resnext.py b/PaddleCV/image_classification/models/resnext.py
index ad973387fe..e6918b177b 100644
--- a/PaddleCV/image_classification/models/resnext.py
+++ b/PaddleCV/image_classification/models/resnext.py
@@ -33,7 +33,7 @@ def __init__(self, layers=50, cardinality=64):
         self.layers = layers
         self.cardinality = cardinality
 
-    def net(self, input, class_dim=1000):
+    def net(self, input, class_dim=1000, data_format="NCHW"):
         layers = self.layers
         cardinality = self.cardinality
         supported_layers = [50, 101, 152]
@@ -56,13 +56,15 @@ def net(self, input, class_dim=1000):
             filter_size=7,
             stride=2,
             act='relu',
-            name="res_conv1") #debug
+            name="res_conv1", #debug
+            data_format=data_format)
         conv = fluid.layers.pool2d(
             input=conv,
             pool_size=3,
             pool_stride=2,
             pool_padding=1,
-            pool_type='max')
+            pool_type='max',
+            data_format=data_format)
 
         for block in range(len(depth)):
             for i in range(depth[block]):
@@ -79,10 +81,11 @@ def net(self, input, class_dim=1000):
                     if cardinality == 64 else num_filters2[block],
                     stride=2 if i == 0 and block != 0 else 1,
                     cardinality=cardinality,
-                    name=conv_name)
+                    name=conv_name,
+                    data_format=data_format)
 
         pool = fluid.layers.pool2d(
-            input=conv, pool_type='avg', global_pooling=True)
+            input=conv, pool_type='avg', global_pooling=True, data_format=data_format)
         stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
         out = fluid.layers.fc(
             input=pool,
@@ -100,7 +103,8 @@ def conv_bn_layer(self,
                       stride=1,
                       groups=1,
                       act=None,
-                      name=None):
+                      name=None,
+                      data_format='NCHW'):
         conv = fluid.layers.conv2d(
             input=input,
             num_filters=num_filters,
@@ -111,7 +115,8 @@ def conv_bn_layer(self,
             act=None,
             param_attr=ParamAttr(name=name + "_weights"),
             bias_attr=False,
-            name=name + '.conv2d.output.1')
+            name=name + '.conv2d.output.1',
+            data_format=data_format)
         if name == "conv1":
             bn_name = "bn_" + name
         else:
@@ -123,23 +128,28 @@ def conv_bn_layer(self,
             param_attr=ParamAttr(name=bn_name + '_scale'),
             bias_attr=ParamAttr(bn_name + '_offset'),
             moving_mean_name=bn_name + '_mean',
-            moving_variance_name=bn_name + '_variance', )
+            moving_variance_name=bn_name + '_variance',
+            data_layout=data_format)
 
-    def shortcut(self, input, ch_out, stride, name):
-        ch_in = input.shape[1]
+    def shortcut(self, input, ch_out, stride, name, data_format):
+        if data_format == "NCHW":
+            ch_in = input.shape[1]
+        else:
+            ch_in = input.shape[-1]
         if ch_in != ch_out or stride != 1:
-            return self.conv_bn_layer(input, ch_out, 1, stride, name=name)
+            return self.conv_bn_layer(input, ch_out, 1, stride, name=name, data_format=data_format)
         else:
             return input
 
-    def bottleneck_block(self, input, num_filters, stride, cardinality, name):
+    def bottleneck_block(self, input, num_filters, stride, cardinality, name, data_format):
         cardinality = self.cardinality
         conv0 = self.conv_bn_layer(
             input=input,
             num_filters=num_filters,
             filter_size=1,
             act='relu',
-            name=name + "_branch2a")
+            name=name + "_branch2a",
+            data_format=data_format)
         conv1 = self.conv_bn_layer(
             input=conv0,
             num_filters=num_filters,
@@ -147,19 +157,22 @@ def bottleneck_block(self, input, num_filters, stride, cardinality, name):
             stride=stride,
             groups=cardinality,
             act='relu',
-            name=name + "_branch2b")
+            name=name + "_branch2b",
+            data_format=data_format)
         conv2 = self.conv_bn_layer(
             input=conv1,
             num_filters=num_filters if cardinality == 64 else num_filters * 2,
             filter_size=1,
             act=None,
-            name=name + "_branch2c")
+            name=name + "_branch2c",
+            data_format=data_format)
 
         short = self.shortcut(
             input,
             num_filters if cardinality == 64 else num_filters * 2,
             stride,
-            name=name + "_branch1")
+            name=name + "_branch1",
+            data_format=data_format)
 
         return fluid.layers.elementwise_add(
             x=short, y=conv2, act='relu', name=name + ".add.output.5")
diff --git a/PaddleCV/image_classification/train.py b/PaddleCV/image_classification/train.py
index 59ae6983ce..68728177a1 100755
--- a/PaddleCV/image_classification/train.py
+++ b/PaddleCV/image_classification/train.py
@@ -264,8 +264,13 @@ def train(args):
             test_iter = test_data_loader()
 
         batch_start = time.time()
+        ips_avg = []
         for batch in train_iter:
             #NOTE: this is for benchmark
             if args.max_iter and total_batch_num == args.max_iter:
+                # NOTE: benchmark only, report the average ips measured
+                # after the warm-up iterations before stopping.
+                if ips_avg:
+                    print("Average ips: ", np.mean(ips_avg))
                 return
             reader_cost_averager.record(time.time() - batch_start)
@@ -285,6 +290,9 @@ def train(args):
 
             if trainer_id == 0:
                 ips = float(args.batch_size) / batch_cost_averager.get_average()
+                # NOTE: skip the first iterations when averaging ips
+                if total_batch_num > 40:
+                    ips_avg.append(ips)
                 print_info(
                     "batch",
                     train_batch_metrics_avg,

From b3509b81e6d8e482bc6e78222d155ec620b36fdd Mon Sep 17 00:00:00 2001
From: huangxu96
Date: Thu, 12 Nov 2020 03:57:15 +0000
Subject: [PATCH 2/2] Added training script

---
 .../scripts/train/ResNeXt101_32x4d_fp16.sh | 49 +++++++++++++++++++
 1 file changed, 49 insertions(+)
 create mode 100755 PaddleCV/image_classification/scripts/train/ResNeXt101_32x4d_fp16.sh

diff --git a/PaddleCV/image_classification/scripts/train/ResNeXt101_32x4d_fp16.sh b/PaddleCV/image_classification/scripts/train/ResNeXt101_32x4d_fp16.sh
new file mode 100755
index 0000000000..ee760991cd
--- /dev/null
+++ b/PaddleCV/image_classification/scripts/train/ResNeXt101_32x4d_fp16.sh
@@ -0,0 +1,49 @@
+#!/bin/bash -ex
+
+export CUDA_VISIBLE_DEVICES=5
+export FLAGS_conv_workspace_size_limit=4000 #MB
+export FLAGS_cudnn_exhaustive_search=0
+export FLAGS_cudnn_batchnorm_spatial_persistent=1
+
+DATA_DIR="Your image dataset path, e.g. /work/datasets/ILSVRC2012/"
+
+DATA_FORMAT="NHWC"
+USE_FP16=true #whether to use float16
+USE_DALI=true
+USE_ADDTO=true
+
+if ${USE_ADDTO} ;then
+    export FLAGS_max_inplace_grad_add=8
+fi
+
+if ${USE_DALI}; then
+    export FLAGS_fraction_of_gpu_memory_to_use=0.8
+fi
+
+python train.py \
+       --model=ResNeXt101_32x4d \
+       --data_dir=${DATA_DIR} \
+       --batch_size=32 \
+       --total_images=1281167 \
+       --image_shape 4 224 224 \
+       --class_dim=1000 \
+       --print_step=10 \
+       --model_save_dir=output/ \
+       --lr_strategy=piecewise_decay \
+       --use_fp16=${USE_FP16} \
+       --scale_loss=128.0 \
+       --use_dynamic_loss_scaling=true \
+       --data_format=${DATA_FORMAT} \
+       --fuse_elewise_add_act_ops=true \
+       --fuse_bn_act_ops=true \
+       --fuse_bn_add_act_ops=true \
+       --enable_addto=${USE_ADDTO} \
+       --validate=true \
+       --is_profiler=false \
+       --profiler_path=profile/ \
+       --reader_thread=10 \
+       --reader_buf_size=4000 \
+       --use_dali=${USE_DALI} \
+       --lr=0.1
+
+
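Usage sketch for the new script (assumes the DATA_DIR placeholder inside it has been pointed at a local ImageNet copy and that GPU visibility/batch size are adjusted for the machine; same launch pattern as the existing ResNet50 FP16 script):

    cd PaddleCV/image_classification
    bash ./scripts/train/ResNeXt101_32x4d_fp16.sh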