diff --git a/paddle/fluid/operators/data/CMakeLists.txt b/paddle/fluid/operators/data/CMakeLists.txt
index 915bda52a4de69..35c081e9bbc78f 100644
--- a/paddle/fluid/operators/data/CMakeLists.txt
+++ b/paddle/fluid/operators/data/CMakeLists.txt
@@ -4,10 +4,6 @@ if(WITH_UNITY_BUILD)
     include(unity_build_rule.cmake)
 endif()
 
-# find_package(ZLIB)
-# include_directories(${ZLIB_INCLUDE_DIRS})
-# TARGET_LINK_LIBRARIES( ${ZLIB_LIBRARIES})
-
 cc_library(pipeline SRCS pipeline.cc DEPS parallel_executor simple_threadpool scope)
 op_library(dataloader_op SRCS dataloader_op.cc dataloader_op.cu.cc DEPS pipeline ${OP_HEADER_DEPS})
 
@@ -31,4 +27,4 @@ op_library(random_flip_op SRCS random_flip_op.cc DEPS ${OP_HEADER_DEPS})
 # register_operators()
 
 # TODO: add test here
-# cc_test(xxx SRCS xxx DEPS xxx)
+# cc_test(xxx SRCS xxx DEPS xxx)
\ No newline at end of file
diff --git a/paddle/fluid/operators/data/batch_decode_random_crop_op.cc b/paddle/fluid/operators/data/batch_decode_random_crop_op.cc
index 2ca56063936d14..7660f7f3ccb5a7 100644
--- a/paddle/fluid/operators/data/batch_decode_random_crop_op.cc
+++ b/paddle/fluid/operators/data/batch_decode_random_crop_op.cc
@@ -132,6 +132,13 @@ and 255.
         "for optionally converting the image, can be \"unchanged\" "
         ",\"gray\" , \"rgb\" .")
         .SetDefault("unchanged");
+    AddAttr<std::string>(
+        "data_layout",
+        "(string, default NCHW) Only used in "
+        "an optional string from: \"NHWC\", \"NCHW\". "
+        "Specify that the data format of the input and output data is "
+        "channel_first or channel_last.")
+        .SetDefault("NCHW");
     AddAttr<float>("aspect_ratio_min", "").SetDefault(3./4.);
     AddAttr<float>("aspect_ratio_max", "").SetDefault(4./3.);
     AddAttr<float>("area_min", "").SetDefault(0.08);
diff --git a/paddle/fluid/operators/data/batch_decode_random_crop_op.cu b/paddle/fluid/operators/data/batch_decode_random_crop_op.cu
index eecf5da9bed9ca..c15e9d0ae3e471 100644
--- a/paddle/fluid/operators/data/batch_decode_random_crop_op.cu
+++ b/paddle/fluid/operators/data/batch_decode_random_crop_op.cu
@@ -16,12 +16,15 @@
 
 #include "paddle/fluid/operators/data/batch_decode_random_crop_op.h"
 #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
+#include "paddle/fluid/operators/math/math_function.h"
+// #include "paddle/fluid/operators/transpose_op.h"
 
 namespace paddle {
 namespace operators {
 namespace data {
 
 using LoDTensorBlockingQueueHolder = operators::reader::LoDTensorBlockingQueueHolder;
+using DataLayout = framework::DataLayout;
 
 NvjpegDecoderThreadPool* decode_pool = nullptr;
 // std::seed_seq* rand_seq = nullptr;
@@ -50,6 +53,15 @@ class GPUBatchDecodeRandomCropKernel : public framework::OpKernel<T> {
     auto& out_array = *out->GetMutable<framework::LoDTensorArray>();
     out_array.resize(inputs->size());
 
+    const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
+    const DataLayout data_layout =
+        framework::StringToDataLayout(data_layout_str);
+
+    framework::LoDTensorArray temp_array;
+    if (data_layout == DataLayout::kNCHW) {
+      temp_array.resize(inputs->size());
+    }
+
     auto aspect_ratio_min = ctx.Attr<float>("aspect_ratio_min");
     auto aspect_ratio_max = ctx.Attr<float>("aspect_ratio_max");
     AspectRatioRange aspect_ratio_range{aspect_ratio_min, aspect_ratio_max};
@@ -66,20 +78,52 @@ class GPUBatchDecodeRandomCropKernel : public framework::OpKernel<T> {
       const framework::LoDTensor x = inputs->at(i);
       auto* x_data = x.data<T>();
       size_t x_numel = static_cast<size_t>(x.numel());
-
-      NvjpegDecodeTask task = {
-        .bit_stream = x_data,
-        .bit_len = x_numel,
-        .tensor = &out_array[i],
-        .roi_generator = new RandomROIGenerator(
-                                aspect_ratio_range, area_range, rands[i]),
-        .place = dev
-      };
-      decode_pool->AddTask(std::make_shared<NvjpegDecodeTask>(task));
+      
+      if (data_layout == DataLayout::kNCHW){
+        NvjpegDecodeTask task = {
+          .bit_stream = x_data,
+          .bit_len = x_numel,
+          .tensor = &temp_array[i],
+          .roi_generator = new RandomROIGenerator(
+                                  aspect_ratio_range, area_range, rands[i]),
+          .place = dev
+        };
+        decode_pool->AddTask(std::make_shared<NvjpegDecodeTask>(task));
+      }
+      else{
+        NvjpegDecodeTask task = {
+          .bit_stream = x_data,
+          .bit_len = x_numel,
+          .tensor = &out_array[i],
+          .roi_generator = new RandomROIGenerator(
+                                  aspect_ratio_range, area_range, rands[i]),
+          .place = dev
+        };
+        decode_pool->AddTask(std::make_shared<NvjpegDecodeTask>(task));
+      }
+      
     }
 
     decode_pool->RunAll(true);
 
+    if (data_layout == DataLayout::kNCHW){
+      const auto& dev_ctx = ctx.cuda_device_context();
+      paddle::operators::math::Transpose<paddle::platform::CUDADeviceContext, T, 3> trans;
+      std::vector<int> axis = {2, 0, 1};
+      // LOG(ERROR) << "start transpose 01!!!";
+      for (size_t i = 0; i < inputs->size(); i++) {
+        // Transpose the decoded HWC image into CHW order (axis = {2, 0, 1}).
+        const framework::DDim& in_sizes = temp_array[i].dims();
+        // const int ndim = in_sizes.size();
+        framework::DDim transposed_input_shape = in_sizes.transpose(axis);
+        std::vector<int64_t> transposed_input_shape_ =
+            framework::vectorize(transposed_input_shape);
+        out_array[i].Resize(transposed_input_shape);
+        out_array[i].mutable_data<T>(dev_ctx.GetPlace());
+        trans(dev_ctx, temp_array[i], &out_array[i], axis);
+      }
+    }
+
     LOG(ERROR) << "GPUBatchDecodeJpegKernel Compute finish";
   }
 };
diff --git a/paddle/fluid/operators/data/batch_decode_random_crop_op.h b/paddle/fluid/operators/data/batch_decode_random_crop_op.h
index fd23be38341dc9..de96e38ca95ef9 100644
--- a/paddle/fluid/operators/data/batch_decode_random_crop_op.h
+++ b/paddle/fluid/operators/data/batch_decode_random_crop_op.h
@@ -24,7 +24,6 @@
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/operators/data/nvjpeg_decoder.h"
 
-
 namespace paddle {
 namespace operators {
 namespace data {
diff --git a/paddle/fluid/operators/data/batch_resize_op.cu b/paddle/fluid/operators/data/batch_resize_op.cu
index e2c0319fdcf051..7728a6b4631631 100644
--- a/paddle/fluid/operators/data/batch_resize_op.cu
+++ b/paddle/fluid/operators/data/batch_resize_op.cu
@@ -239,7 +239,11 @@ class BatchResizeCUDAKernel : public framework::OpKernel<T> {
                   img->dims()[0] : img->dims()[2];
 
     std::vector<int64_t> out_dim = {static_cast<int64_t>(x->size()),
+                                    size[0], size[1], img_c};
+    if (data_layout == DataLayout::kNCHW) {
+      out_dim = {static_cast<int64_t>(x->size()),
                                     img_c, size[0], size[1]};
+    }
     out->Resize(framework::make_ddim(out_dim));
     out->mutable_data<T>(ctx.GetPlace());
 
diff --git a/paddle/fluid/operators/data/file_label_loader_op.h b/paddle/fluid/operators/data/file_label_loader_op.h
index bbc34a7b546bf1..eef07790372a24 100644
--- a/paddle/fluid/operators/data/file_label_loader_op.h
+++ b/paddle/fluid/operators/data/file_label_loader_op.h
@@ -147,6 +147,7 @@ static void ParseFilesAndLabels(const std::string data_root,
     }
     closedir(dir);
   }
+  
 }
 
 std::map<std::string, std::vector<std::pair<std::string, int>>> root_to_samples_;
@@ -156,6 +157,8 @@ static std::vector<std::pair<std::string, int>>* GetFilesAndLabelsFromCache(cons
   if (iter == root_to_samples_.end()) {
     std::vector<std::pair<std::string, int>> samples;
     ParseFilesAndLabels(data_root, &samples);
+    // std::cout << "files 0: " << samples[0].first << std::endl;
+    // std::cout << "files 1: " << samples[1].first << std::endl;
     LOG(ERROR) << "Init samples: " << samples.size();
     root_to_samples_[data_root] = samples;
   }
diff --git a/paddle/fluid/operators/data/nvjpeg_decoder.cc b/paddle/fluid/operators/data/nvjpeg_decoder.cc
index 784b69b28b05c6..188723ba306fea 100644
--- a/paddle/fluid/operators/data/nvjpeg_decoder.cc
+++ b/paddle/fluid/operators/data/nvjpeg_decoder.cc
@@ -86,6 +86,7 @@ void NvjpegDecoder::CPUDecodeRandomCropResize(const uint8_t* data, size_t length
                                 framework::LoDTensor& temp, framework::LoDTensor* out, platform::Place place) {
   cv::Mat image =
       cv::imdecode(cv::Mat(1, length, CV_8UC1, const_cast<unsigned char*>(data)), cv::IMREAD_COLOR);
+  
   cv::Mat cropped;
   int height;
   int width;
@@ -99,18 +100,21 @@ void NvjpegDecoder::CPUDecodeRandomCropResize(const uint8_t* data, size_t length
     cv_roi.height = roi.h;
     height = roi.h;
     width = roi.w;
-    std::vector<int64_t> out_shape = {3, height, width};
+
+    std::vector<int64_t> out_shape = {height, width, 3};
     temp.Resize(framework::make_ddim(out_shape));
     platform::CPUPlace cpu;
     // allocate memory and assign to out_image
     auto* data = temp.mutable_data<uint8_t>(cpu);
-    cropped.data = data;
+    // TODO(jianglielin): figure out why pointing cropped.data at the tensor buffer does not work.
+    // cropped.data = data;
     image(cv_roi).copyTo(cropped);
-    out->Resize(framework::make_ddim(out_shape));
-    
+
+    std::memcpy(data, cropped.data, 3 * height * width);
+
     TensorCopySync(temp, place, out);
-    
   } else {
+    LOG(ERROR) << "Not Use Opencv decode!!!";
     // throw error
   }
 }
@@ -139,6 +143,9 @@ int NvjpegDecoder::ParseDecodeParams(
     return 1;
 #endif
   }
+  else{
+    // LOG(ERROR) << "Use nvjpeg decode!!!";
+  }
 
   int64_t width = static_cast<int64_t>(widths[0]);
   int64_t height = static_cast<int64_t>(heights[0]);
@@ -180,7 +187,7 @@ int NvjpegDecoder::ParseDecodeParams(
     width = roi.w;
   }
 
-  std::vector<int64_t> out_shape = {output_components, height, width};
+  std::vector<int64_t> out_shape = {height, width, output_components};
   out->Resize(framework::make_ddim(out_shape));
 
   // allocate memory and assign to out_image
@@ -217,7 +224,9 @@ void NvjpegDecoder::Run(
   if (res) {
     return;
   }
+  // LOG(ERROR) << "ParseDecodeParams finish !!!";
   Decode(bit_stream, bit_len, &image);
+  // LOG(ERROR) << "Decode finish !!!";
 }
 
 NvjpegDecoderThreadPool::NvjpegDecoderThreadPool(const int num_threads, const std::string mode, const int dev_id)
diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu
index cfdfa456e39eac..0ee26752aebc34 100644
--- a/paddle/fluid/operators/math/math_function.cu
+++ b/paddle/fluid/operators/math/math_function.cu
@@ -50,6 +50,7 @@ template struct SetConstant<platform::CUDADeviceContext,
   template struct Transpose<platform::CUDADeviceContext, float16, RANK>;  \
   template struct Transpose<platform::CUDADeviceContext, bfloat16, RANK>; \
   template struct Transpose<platform::CUDADeviceContext, int8_t, RANK>;   \
+  template struct Transpose<platform::CUDADeviceContext, uint8_t, RANK>;  \
   template struct Transpose<platform::CUDADeviceContext, int32_t, RANK>;  \
   template struct Transpose<platform::CUDADeviceContext, int64_t, RANK>;  \
   template struct Transpose<platform::CUDADeviceContext,                  \
diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py
index f9084b629c7e54..e3eeceb55a5fc1 100644
--- a/python/paddle/vision/ops.py
+++ b/python/paddle/vision/ops.py
@@ -932,6 +932,7 @@ def image_decode(x, mode='unchanged', num_threads=2, name=None):
 def image_decode_random_crop(x,
                              mode='unchanged',
                              num_threads=2,
+                             data_layout='NCHW',
                              aspect_ratio_min=3./4.,
                              aspect_ratio_max=4./3.,
                              area_min=0.08,
@@ -982,7 +983,7 @@ def image_decode_random_crop(x,
                 core.VarDesc.VarType.LOD_TENSOR_ARRAY, False)
         program_id = utils._hash_with_id(mode, num_threads, name, local_rank)
         return _C_ops.batch_decode_random_crop(
-                x, out, "mode", mode, "num_threads", num_threads,
+                x, out, "mode", mode, "num_threads", num_threads, "data_layout", data_layout,
                 "aspect_ratio_min", aspect_ratio_min,
                 "aspect_ratio_max", aspect_ratio_max,
                 "area_min", area_min, "area_max", area_max,
@@ -992,6 +993,7 @@ def image_decode_random_crop(x,
     inputs = {'X': x}
     attrs = {"mode": mode,
              "num_threads": num_threads,
+             "data_layout": data_layout,
              "aspect_ratio_min": aspect_ratio_min,
              "aspect_ratio_max": aspect_ratio_max,
              "area_min": area_min,