diff --git a/PaddleCV/image_classification/build_model.py b/PaddleCV/image_classification/build_model.py
index 3f7a3a8075..5b45432924 100644
--- a/PaddleCV/image_classification/build_model.py
+++ b/PaddleCV/image_classification/build_model.py
@@ -35,15 +35,16 @@ def _calc_label_smoothing_loss(softmax_out, label, class_dim, epsilon):
 def _basic_model(data, model, args, is_train):
     image = data[0]
     label = data[1]
-    if args.model == "ResNet50":
-        image_in = fluid.layers.transpose(
-            image, [0, 2, 3, 1]) if args.data_format == 'NHWC' else image
-        image_in.stop_gradient = image.stop_gradient
-        net_out = model.net(input=image_in,
-                            class_dim=args.class_dim,
-                            data_format=args.data_format)
-    else:
-        net_out = model.net(input=image, class_dim=args.class_dim)
+    print ("args.data_format:", args.data_format)
+    # if args.model == ("ResNet50" or "ResNeXt101_32x4d"):
+    image_in = fluid.layers.transpose(
+        image, [0, 2, 3, 1]) if args.data_format == 'NHWC' else image
+    image_in.stop_gradient = image.stop_gradient
+    net_out = model.net(input=image_in,
+                        class_dim=args.class_dim,
+                        data_format=args.data_format)
+    # else:
+    #     net_out = model.net(input=image, class_dim=args.class_dim)
 
     softmax_out = fluid.layers.softmax(net_out, use_cudnn=False)
     if is_train and args.use_label_smoothing:
@@ -95,15 +96,15 @@ def _mixup_model(data, model, args, is_train):
     y_b = data[2]
     lam = data[3]
 
-    if args.model == "ResNet50":
-        image_in = fluid.layers.transpose(
-            image, [0, 2, 3, 1]) if args.data_format == 'NHWC' else image
-        image_in.stop_gradient = image.stop_gradient
-        net_out = model.net(input=image_in,
-                            class_dim=args.class_dim,
-                            data_format=args.data_format)
-    else:
-        net_out = model.net(input=image, class_dim=args.class_dim)
+    # if args.model == "ResNet50":
+    image_in = fluid.layers.transpose(
+        image, [0, 2, 3, 1]) if args.data_format == 'NHWC' else image
+    image_in.stop_gradient = image.stop_gradient
+    net_out = model.net(input=image_in,
+                        class_dim=args.class_dim,
+                        data_format=args.data_format)
+    # else:
+    #     net_out = model.net(input=image, class_dim=args.class_dim)
     softmax_out = fluid.layers.softmax(net_out, use_cudnn=False)
     if not args.use_label_smoothing:
         loss_a = fluid.layers.cross_entropy(input=softmax_out, label=y_a)
diff --git a/PaddleCV/image_classification/models/resnext.py b/PaddleCV/image_classification/models/resnext.py
index ad973387fe..e6918b177b 100644
--- a/PaddleCV/image_classification/models/resnext.py
+++ b/PaddleCV/image_classification/models/resnext.py
@@ -33,7 +33,7 @@ def __init__(self, layers=50, cardinality=64):
         self.layers = layers
         self.cardinality = cardinality
 
-    def net(self, input, class_dim=1000):
+    def net(self, input, class_dim=1000, data_format="NCHW"):
         layers = self.layers
         cardinality = self.cardinality
         supported_layers = [50, 101, 152]
@@ -56,13 +56,15 @@ def net(self, input, class_dim=1000):
             filter_size=7,
             stride=2,
             act='relu',
-            name="res_conv1") #debug
+            name="res_conv1", #debug
+            data_format=data_format)
         conv = fluid.layers.pool2d(
             input=conv,
             pool_size=3,
             pool_stride=2,
             pool_padding=1,
-            pool_type='max')
+            pool_type='max',
+            data_format=data_format)
 
         for block in range(len(depth)):
             for i in range(depth[block]):
@@ -79,10 +81,11 @@ def net(self, input, class_dim=1000):
                     if cardinality == 64 else num_filters2[block],
                     stride=2 if i == 0 and block != 0 else 1,
                     cardinality=cardinality,
-                    name=conv_name)
+                    name=conv_name,
+                    data_format=data_format)
 
         pool = fluid.layers.pool2d(
-            input=conv, pool_type='avg', global_pooling=True)
+            input=conv, pool_type='avg', global_pooling=True, data_format=data_format)
         stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
         out = fluid.layers.fc(
             input=pool,
@@ -100,7 +103,8 @@ def conv_bn_layer(self,
                       stride=1,
                       groups=1,
                       act=None,
-                      name=None):
+                      name=None,
+                      data_format='NCHW'):
         conv = fluid.layers.conv2d(
             input=input,
             num_filters=num_filters,
@@ -111,7 +115,8 @@ def conv_bn_layer(self,
             act=None,
             param_attr=ParamAttr(name=name + "_weights"),
             bias_attr=False,
-            name=name + '.conv2d.output.1')
+            name=name + '.conv2d.output.1',
+            data_format=data_format)
         if name == "conv1":
             bn_name = "bn_" + name
         else:
@@ -123,23 +128,28 @@ def conv_bn_layer(self,
             param_attr=ParamAttr(name=bn_name + '_scale'),
             bias_attr=ParamAttr(bn_name + '_offset'),
             moving_mean_name=bn_name + '_mean',
-            moving_variance_name=bn_name + '_variance', )
+            moving_variance_name=bn_name + '_variance',
+            data_layout=data_format)
 
-    def shortcut(self, input, ch_out, stride, name):
-        ch_in = input.shape[1]
+    def shortcut(self, input, ch_out, stride, name, data_format):
+        if data_format == "NCHW":
+            ch_in = input.shape[1]
+        else:
+            ch_in = input.shape[-1]
         if ch_in != ch_out or stride != 1:
-            return self.conv_bn_layer(input, ch_out, 1, stride, name=name)
+            return self.conv_bn_layer(input, ch_out, 1, stride, name=name, data_format=data_format)
         else:
             return input
 
-    def bottleneck_block(self, input, num_filters, stride, cardinality, name):
+    def bottleneck_block(self, input, num_filters, stride, cardinality, name, data_format):
         cardinality = self.cardinality
         conv0 = self.conv_bn_layer(
             input=input,
             num_filters=num_filters,
             filter_size=1,
             act='relu',
-            name=name + "_branch2a")
+            name=name + "_branch2a",
+            data_format=data_format)
         conv1 = self.conv_bn_layer(
             input=conv0,
             num_filters=num_filters,
@@ -147,19 +157,22 @@ def bottleneck_block(self, input, num_filters, stride, cardinality, name):
             stride=stride,
             groups=cardinality,
             act='relu',
-            name=name + "_branch2b")
+            name=name + "_branch2b",
+            data_format=data_format)
         conv2 = self.conv_bn_layer(
             input=conv1,
             num_filters=num_filters if cardinality == 64 else num_filters * 2,
             filter_size=1,
             act=None,
-            name=name + "_branch2c")
+            name=name + "_branch2c",
+            data_format=data_format)
 
         short = self.shortcut(
             input,
             num_filters if cardinality == 64 else num_filters * 2,
             stride,
-            name=name + "_branch1")
+            name=name + "_branch1",
+            data_format=data_format)
 
         return fluid.layers.elementwise_add(
             x=short, y=conv2, act='relu', name=name + ".add.output.5")
diff --git a/PaddleCV/image_classification/scripts/train/ResNeXt101_32x4d_fp16.sh b/PaddleCV/image_classification/scripts/train/ResNeXt101_32x4d_fp16.sh
new file mode 100755
index 0000000000..ee760991cd
--- /dev/null
+++ b/PaddleCV/image_classification/scripts/train/ResNeXt101_32x4d_fp16.sh
@@ -0,0 +1,49 @@
+#!/bin/bash -ex
+
+export CUDA_VISIBLE_DEVICES=5
+export FLAGS_conv_workspace_size_limit=4000 #MB
+export FLAGS_cudnn_exhaustive_search=0
+export FLAGS_cudnn_batchnorm_spatial_persistent=1
+
+DATA_DIR="/ssd3/datasets/ILSVRC2012"
+
+DATA_FORMAT="NHWC"
+USE_FP16=false #whether to use float16
+USE_DALI=true
+USE_ADDTO=true
+
+if ${USE_ADDTO} ;then
+    export FLAGS_max_inplace_grad_add=8
+fi
+
+if ${USE_DALI}; then
+    export FLAGS_fraction_of_gpu_memory_to_use=0.8
+fi
+
+nvprof -o timeline_output -f --cpu-profiling off --profile-from-start off python train.py \
+       --model=ResNeXt101_32x4d \
+       --data_dir=${DATA_DIR} \
+       --batch_size=32 \
+       --total_images=1281167 \
+       --image_shape 4 224 224 \
+       --class_dim=1000 \
+       --print_step=10 \
+       --model_save_dir=output/ \
+       --lr_strategy=piecewise_decay \
+       --use_fp16=${USE_FP16} \
+       --scale_loss=128.0 \
+       --use_dynamic_loss_scaling=true \
+       --data_format=${DATA_FORMAT} \
+       --fuse_elewise_add_act_ops=true \
+       --fuse_bn_act_ops=true \
+       --fuse_bn_add_act_ops=true \
+       --enable_addto=${USE_ADDTO} \
+       --validate=true \
+       --is_profiler=false \
+       --profiler_path=profile/ \
+       --reader_thread=10 \
+       --reader_buf_size=4000 \
+       --use_dali=${USE_DALI} \
+       --lr=0.1
+
+
diff --git a/PaddleCV/image_classification/scripts/train/ResNet50_fp16.sh b/PaddleCV/image_classification/scripts/train/ResNet50_fp16.sh
index 3a4090c1c4..456caa46a7 100755
--- a/PaddleCV/image_classification/scripts/train/ResNet50_fp16.sh
+++ b/PaddleCV/image_classification/scripts/train/ResNet50_fp16.sh
@@ -1,11 +1,11 @@
 #!/bin/bash -ex
 
+export CUDA_VISIBLE_DEVICES=4
 export FLAGS_conv_workspace_size_limit=4000 #MB
 export FLAGS_cudnn_exhaustive_search=1
 export FLAGS_cudnn_batchnorm_spatial_persistent=1
-
-DATA_DIR="Your image dataset path, e.g. /work/datasets/ILSVRC2012/"
+DATA_DIR="/ssd3/datasets/ILSVRC2012"
 
 DATA_FORMAT="NHWC"
 USE_FP16=true #whether to use float16
@@ -23,7 +23,7 @@ fi
 python train.py \
        --model=ResNet50 \
        --data_dir=${DATA_DIR} \
-       --batch_size=256 \
+       --batch_size=128 \
        --total_images=1281167 \
        --image_shape 4 224 224 \
        --class_dim=1000 \
diff --git a/PaddleCV/image_classification/train.py b/PaddleCV/image_classification/train.py
index 59ae6983ce..68728177a1 100755
--- a/PaddleCV/image_classification/train.py
+++ b/PaddleCV/image_classification/train.py
@@ -264,8 +264,18 @@ def train(args):
             test_iter = test_data_loader()
 
         batch_start = time.time()
+        ips_avg = []
         for batch in train_iter:
             #NOTE: this is for benchmark
+
+            # if total_batch_num == 200:
+            #     fluid.core.nvprof_start()
+            # if total_batch_num == 210:
+            #     fluid.core.nvprof_stop()
+            if total_batch_num == 200:
+                print(">>>>>>>>>>>>>>>>>>>>>>>>>> Average ips: ", np.mean(ips_avg),">>>>>>>>>>>>>>>>>>>>>>>>")
+                #fluid.core.nvprof_stop()
+                return
             if args.max_iter and total_batch_num == args.max_iter:
                 return
             reader_cost_averager.record(time.time() - batch_start)
@@ -285,6 +295,9 @@ def train(args):
 
             if trainer_id == 0:
                 ips = float(args.batch_size) / batch_cost_averager.get_average()
+                if total_batch_num > 40:
+                    ips_avg.append(ips)
+
                 print_info(
                     "batch",
                     train_batch_metrics_avg,
diff --git a/PaddleCV/image_classification/utils/utility.py b/PaddleCV/image_classification/utils/utility.py
index bef2214747..0b7b080bb4 100644
--- a/PaddleCV/image_classification/utils/utility.py
+++ b/PaddleCV/image_classification/utils/utility.py
@@ -200,7 +200,8 @@ def check_version():
          "Please make sure the version is good with your code." \
 
     try:
-        fluid.require_version('1.6.0')
+        a = 1
+        #fluid.require_version('1.6.0')
     except Exception as e:
         logger.error(err)
         sys.exit(1)
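
For reference, a minimal sketch of the NHWC layout plumbing the changes above rely on. It is not part of the diff; it assumes the Paddle 1.x fluid API, and the helper name simple_nhwc_stem plus its filter/stride numbers are illustrative only:

    import paddle.fluid as fluid

    def simple_nhwc_stem(image, data_format='NHWC'):
        # The data loader still yields NCHW tensors, so transpose once at the
        # input, mirroring _basic_model/_mixup_model in build_model.py.
        if data_format == 'NHWC':
            image = fluid.layers.transpose(image, [0, 2, 3, 1])
        # Every spatial layer is told the layout explicitly; note that
        # batch_norm names the argument data_layout rather than data_format.
        conv = fluid.layers.conv2d(
            input=image,
            num_filters=64,
            filter_size=7,
            stride=2,
            padding=3,
            bias_attr=False,
            data_format=data_format)
        bn = fluid.layers.batch_norm(
            input=conv, act='relu', data_layout=data_format)
        pool = fluid.layers.pool2d(
            input=bn,
            pool_size=3,
            pool_stride=2,
            pool_padding=1,
            pool_type='max',
            data_format=data_format)
        # The channel axis moves with the layout, which is why shortcut() in
        # resnext.py reads input.shape[1] for NCHW but input.shape[-1] for NHWC.
        channels = pool.shape[1] if data_format == 'NCHW' else pool.shape[-1]
        return pool, channels

Only the entry point needs to know the reader's layout; every downstream layer just forwards data_format (data_layout for batch_norm), which is the pattern the resnext.py changes apply to each conv_bn_layer, shortcut, and bottleneck_block call.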