From f482564e1ec3b7fd1c97e2406632c37bf71e59d1 Mon Sep 17 00:00:00 2001
From: LielinJiang <jianglielin@baidu.com>
Date: Mon, 14 Feb 2022 06:31:08 +0000
Subject: [PATCH 1/2] fix decode error and add layout for decode op

---
 paddle/fluid/operators/data/CMakeLists.txt    |  7 +-
 .../data/batch_decode_random_crop_op.cc       |  7 ++
 .../data/batch_decode_random_crop_op.cu       | 64 ++++++++++++++++---
 .../data/batch_decode_random_crop_op.h        | 37 ++++++++++-
 .../fluid/operators/data/batch_resize_op.cu   | 17 +++++
 .../operators/data/file_label_loader_op.h     |  3 +
 paddle/fluid/operators/data/nvjpeg_decoder.cc | 28 ++++++--
 paddle/fluid/operators/math/math_function.cu  |  1 +
 python/paddle/vision/ops.py                   |  4 +-
 9 files changed, 144 insertions(+), 24 deletions(-)
diff --git a/paddle/fluid/operators/data/CMakeLists.txt b/paddle/fluid/operators/data/CMakeLists.txt
index bf6470bd02df3..f16b73ca17272 100644
--- a/paddle/fluid/operators/data/CMakeLists.txt
+++ b/paddle/fluid/operators/data/CMakeLists.txt
@@ -4,10 +4,6 @@ if(WITH_UNITY_BUILD)
     include(unity_build_rule.cmake)
 endif()
 
-# find_package(ZLIB)
-# include_directories(${ZLIB_INCLUDE_DIRS})
-# TARGET_LINK_LIBRARIES( ${ZLIB_LIBRARIES})
-
 cc_library(pipeline SRCS pipeline.cc DEPS parallel_executor simple_threadpool scope)
 op_library(dataloader_op SRCS dataloader_op.cc dataloader_op.cu.cc DEPS pipeline ${OP_HEADER_DEPS})
 
@@ -23,9 +19,10 @@ op_library(batch_decode_op SRCS batch_decode_op.cc batch_decode_op.cu DEPS nvjpe
 
 op_library(random_crop_and_resize_op SRCS random_crop_and_resize_op.cc random_crop_and_resize_op.cu DEPS ${OP_HEADER_DEPS})
 op_library(batch_resize_op SRCS batch_resize_op.cc batch_resize_op.cu DEPS ${OP_HEADER_DEPS})
+
 op_library(file_label_loader_op SRCS file_label_loader_op.cc DEPS ${OP_HEADER_DEPS})
 
 # register_operators()
 
 # TODO: add test here
-# cc_test(xxx SRCS xxx DEPS xxx)
+# cc_test(xxx SRCS xxx DEPS xxx)
\ No newline at end of file
diff --git a/paddle/fluid/operators/data/batch_decode_random_crop_op.cc b/paddle/fluid/operators/data/batch_decode_random_crop_op.cc
index 2ca56063936d1..7660f7f3ccb5a 100644
--- a/paddle/fluid/operators/data/batch_decode_random_crop_op.cc
+++ b/paddle/fluid/operators/data/batch_decode_random_crop_op.cc
@@ -132,6 +132,13 @@ and 255.
         "for optionally converting the image, can be \"unchanged\" "
         ",\"gray\" , \"rgb\" .")
         .SetDefault("unchanged");
+    AddAttr<std::string>(
+        "data_layout",
+        "(string, default NCHW) Only used in "
+        "an optional string from: \"NHWC\", \"NCHW\". "
+        "Specify that the data format of the input and output data is "
+        "channel_first or channel_last.")
+        .SetDefault("NCHW");
     AddAttr<float>("aspect_ratio_min", "").SetDefault(3./4.);
     AddAttr<float>("aspect_ratio_max", "").SetDefault(4./3.);
     AddAttr<float>("area_min", "").SetDefault(0.08);
diff --git a/paddle/fluid/operators/data/batch_decode_random_crop_op.cu b/paddle/fluid/operators/data/batch_decode_random_crop_op.cu
index eecf5da9bed9c..c15e9d0ae3e47 100644
--- a/paddle/fluid/operators/data/batch_decode_random_crop_op.cu
+++ b/paddle/fluid/operators/data/batch_decode_random_crop_op.cu
@@ -16,12 +16,15 @@
 
 #include "paddle/fluid/operators/data/batch_decode_random_crop_op.h"
 #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
+#include "paddle/fluid/operators/math/math_function.h"
+// #include "paddle/fluid/operators/transpose_op.h"
 
 namespace paddle {
 namespace operators {
 namespace data {
 
 using LoDTensorBlockingQueueHolder = operators::reader::LoDTensorBlockingQueueHolder;
+using DataLayout = framework::DataLayout;
 
 NvjpegDecoderThreadPool* decode_pool = nullptr;
 // std::seed_seq* rand_seq = nullptr;
@@ -50,6 +53,15 @@ class GPUBatchDecodeRandomCropKernel : public framework::OpKernel<T> {
     auto& out_array = *out->GetMutable<framework::LoDTensorArray>();
     out_array.resize(inputs->size());
 
+    const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
+    const DataLayout data_layout =
+        framework::StringToDataLayout(data_layout_str);
+
+    framework::LoDTensorArray temp_array;
+    if (data_layout == DataLayout::kNCHW) {
+      temp_array.resize(inputs->size());
+    }
+
     auto aspect_ratio_min = ctx.Attr<float>("aspect_ratio_min");
     auto aspect_ratio_max = ctx.Attr<float>("aspect_ratio_max");
     AspectRatioRange aspect_ratio_range{aspect_ratio_min, aspect_ratio_max};
@@ -66,20 +78,52 @@ class GPUBatchDecodeRandomCropKernel : public framework::OpKernel<T> {
       const framework::LoDTensor x = inputs->at(i);
       auto* x_data = x.data<T>();
       size_t x_numel = static_cast<size_t>(x.numel());
-
-      NvjpegDecodeTask task = {
-        .bit_stream = x_data,
-        .bit_len = x_numel,
-        .tensor = &out_array[i],
-        .roi_generator = new RandomROIGenerator(
-                                aspect_ratio_range, area_range, rands[i]),
-        .place = dev
-      };
-      decode_pool->AddTask(std::make_shared<NvjpegDecodeTask>(task));
+      
+      if (data_layout == DataLayout::kNCHW){
+        NvjpegDecodeTask task = {
+          .bit_stream = x_data,
+          .bit_len = x_numel,
+          .tensor = &temp_array[i],
+          .roi_generator = new RandomROIGenerator(
+                                  aspect_ratio_range, area_range, rands[i]),
+          .place = dev
+        };
+        decode_pool->AddTask(std::make_shared<NvjpegDecodeTask>(task));
+      }
+      else{
+        NvjpegDecodeTask task = {
+          .bit_stream = x_data,
+          .bit_len = x_numel,
+          .tensor = &out_array[i],
+          .roi_generator = new RandomROIGenerator(
+                                  aspect_ratio_range, area_range, rands[i]),
+          .place = dev
+        };
+        decode_pool->AddTask(std::make_shared<NvjpegDecodeTask>(task));
+      }
+      
     }
 
     decode_pool->RunAll(true);
 
+    if (data_layout == DataLayout::kNCHW){
+      const auto& dev_ctx = ctx.cuda_device_context();
+      paddle::operators::math::Transpose<paddle::platform::CUDADeviceContext, T, 3> trans;
+      std::vector<int> axis = {2, 0, 1};
+      // LOG(ERROR) << "start transpose 01!!!";
+      for (size_t i = 0; i < inputs->size(); i++) {
+        // Transpose the decoded HWC image into CHW (axis = {2, 0, 1}) for the NCHW layout.
+        const framework::DDim& in_sizes = temp_array[i].dims();
+        // const int ndim = in_sizes.size();
+        framework::DDim transposed_input_shape = in_sizes.transpose(axis);
+        std::vector<int64_t> transposed_input_shape_ =
+            framework::vectorize(transposed_input_shape);
+        out_array[i].Resize(transposed_input_shape);
+        out_array[i].mutable_data<T>(dev_ctx.GetPlace());
+        trans(dev_ctx, temp_array[i], &out_array[i], axis);
+      }
+    }
+
     LOG(ERROR) << "GPUBatchDecodeJpegKernel Compute finish";
   }
 };
diff --git a/paddle/fluid/operators/data/batch_decode_random_crop_op.h b/paddle/fluid/operators/data/batch_decode_random_crop_op.h
index fd23be38341dc..f599c74a7dfb2 100644
--- a/paddle/fluid/operators/data/batch_decode_random_crop_op.h
+++ b/paddle/fluid/operators/data/batch_decode_random_crop_op.h
@@ -24,11 +24,46 @@
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/operators/data/nvjpeg_decoder.h"
 
-
 namespace paddle {
 namespace operators {
 namespace data {
 
+// template <typename DeviceContext, typename T>
+// void TransCompute(const int dim, const DeviceContext& dev_ctx,
+//                          const framework::Tensor& in, framework::Tensor* out,
+//                          const std::vector<int>& axis) {
+//   switch (dim) {
+//     case 1:
+//       math::Transpose<DeviceContext, T, 1> trans1;
+//       trans1(dev_ctx, in, out, axis);
+//       break;
+//     case 2:
+//       math::Transpose<DeviceContext, T, 2> trans2;
+//       trans2(dev_ctx, in, out, axis);
+//       break;
+//     case 3:
+//       math::Transpose<DeviceContext, T, 3> trans3;
+//       trans3(dev_ctx, in, out, axis);
+//       break;
+//     case 4:
+//       math::Transpose<DeviceContext, T, 4> trans4;
+//       trans4(dev_ctx, in, out, axis);
+//       break;
+//     case 5:
+//       math::Transpose<DeviceContext, T, 5> trans5;
+//       trans5(dev_ctx, in, out, axis);
+//       break;
+//     case 6:
+//       math::Transpose<DeviceContext, T, 6> trans6;
+//       trans6(dev_ctx, in, out, axis);
+//       break;
+//     default:
+//       // for dim >= 7 situation
+//       math::TransposeNormal<DeviceContext, T> trans_normal;
+//       trans_normal(dev_ctx, in, out, axis);
+//   }
+// }
+
 template <typename T>
 class CPUBatchDecodeRandomCropKernel : public framework::OpKernel<T> {
  public:
diff --git a/paddle/fluid/operators/data/batch_resize_op.cu b/paddle/fluid/operators/data/batch_resize_op.cu
index e2c0319fdcf05..f7a7f52a3703e 100644
--- a/paddle/fluid/operators/data/batch_resize_op.cu
+++ b/paddle/fluid/operators/data/batch_resize_op.cu
@@ -234,15 +234,32 @@ class BatchResizeCUDAKernel : public framework::OpKernel<T> {
     bool align_corners = ctx.Attr<bool>("align_corners");
     int align_mode = ctx.Attr<int>("align_mode");
 
+    // int img_h, img_w;//, idx_h, idx_w, crop_h, crop_w;
+
     auto* img = &x->at(0);
     int64_t img_c = data_layout == DataLayout::kNCHW ? \
                   img->dims()[0] : img->dims()[2];
 
+    LOG(ERROR) << "img channel: " << img_c << " || " << data_layout_str;
+    // img_h =
+    //     data_layout == DataLayout::kNCHW ? img->dims()[1] : img->dims()[0];
+    // img_w =
+    //     data_layout == DataLayout::kNCHW ? img->dims()[2] : img->dims()[1];
+
     std::vector<int64_t> out_dim = {static_cast<int64_t>(x->size()),
+                                    size[0], size[1], img_c};
+    if (data_layout == DataLayout::kNCHW) {
+      out_dim = {static_cast<int64_t>(x->size()),
                                     img_c, size[0], size[1]};
+    }
     out->Resize(framework::make_ddim(out_dim));
     out->mutable_data<T>(ctx.GetPlace());
 
+    // for (int i = 0; i < x->size(); i++) {
+    //   img = &x->at(i);
+    //   auto out_tensor = out->Slice(i, i + 1);
+    //   TensorCopySync(*img, ctx.GetPlace(), &out_tensor);
+    // }
     int img_h, img_w, idx_h, idx_w, crop_h, crop_w;
     for (int i = 0; i < x->size(); i++) {
       img = &x->at(i);
diff --git a/paddle/fluid/operators/data/file_label_loader_op.h b/paddle/fluid/operators/data/file_label_loader_op.h
index bbc34a7b546bf..ffb08a6439b17 100644
--- a/paddle/fluid/operators/data/file_label_loader_op.h
+++ b/paddle/fluid/operators/data/file_label_loader_op.h
@@ -147,6 +147,7 @@ static void ParseFilesAndLabels(const std::string data_root,
     }
     closedir(dir);
   }
+  
 }
 
 std::map<std::string, std::vector<std::pair<std::string, int>>> root_to_samples_;
@@ -156,6 +157,8 @@ static std::vector<std::pair<std::string, int>>* GetFilesAndLabelsFromCache(cons
   if (iter == root_to_samples_.end()) {
     std::vector<std::pair<std::string, int>> samples;
     ParseFilesAndLabels(data_root, &samples);
+    std::cout << "files 0: " << samples[0].first << std::endl;
+    std::cout << "files 1: " << samples[1].first << std::endl;
     LOG(ERROR) << "Init samples: " << samples.size();
     root_to_samples_[data_root] = samples;
   }
diff --git a/paddle/fluid/operators/data/nvjpeg_decoder.cc b/paddle/fluid/operators/data/nvjpeg_decoder.cc
index 784b69b28b05c..56159f81e51f6 100644
--- a/paddle/fluid/operators/data/nvjpeg_decoder.cc
+++ b/paddle/fluid/operators/data/nvjpeg_decoder.cc
@@ -85,7 +85,9 @@ void NvjpegDecoder::CPUDecodeRandomCropResize(const uint8_t* data, size_t length
                                 unsigned char* workspace, size_t workspace_size,
                                 framework::LoDTensor& temp, framework::LoDTensor* out, platform::Place place) {
   cv::Mat image =
+      // cv::imdecode(const_cast<unsigned char*>(data), cv::IMREAD_COLOR);
       cv::imdecode(cv::Mat(1, length, CV_8UC1, const_cast<unsigned char*>(data)), cv::IMREAD_COLOR);
+  
   cv::Mat cropped;
   int height;
   int width;
@@ -93,24 +95,28 @@ void NvjpegDecoder::CPUDecodeRandomCropResize(const uint8_t* data, size_t length
     ROI roi;
     roi_generator->GenerateRandomROI(image.cols, image.rows, &roi);
     cv::Rect cv_roi;
+
     cv_roi.x = roi.x;
     cv_roi.y = roi.y;
     cv_roi.width = roi.w;
     cv_roi.height = roi.h;
     height = roi.h;
     width = roi.w;
-    std::vector<int64_t> out_shape = {3, height, width};
+    // std::vector<int64_t> out_shape = {3, height, width};
+    std::vector<int64_t> out_shape = {height, width, 3};
     temp.Resize(framework::make_ddim(out_shape));
     platform::CPUPlace cpu;
     // allocate memory and assign to out_image
     auto* data = temp.mutable_data<uint8_t>(cpu);
-    cropped.data = data;
+    // TODO(jianglielin): find out why aliasing cropped.data to `data` does not work (copyTo may reallocate).
+    // cropped.data = data;
     image(cv_roi).copyTo(cropped);
-    out->Resize(framework::make_ddim(out_shape));
-    
+
+    std::memcpy(data, cropped.data, 3 * height * width);
+
     TensorCopySync(temp, place, out);
-    
   } else {
+    LOG(ERROR) << "Not Use Opencv decode!!!";
     // throw error
   }
 }
@@ -139,6 +145,9 @@ int NvjpegDecoder::ParseDecodeParams(
     return 1;
 #endif
   }
+  else{
+    // LOG(ERROR) << "Use nvjpeg decode!!!";
+  }
 
   int64_t width = static_cast<int64_t>(widths[0]);
   int64_t height = static_cast<int64_t>(heights[0]);
@@ -174,13 +183,16 @@ int NvjpegDecoder::ParseDecodeParams(
   if (roi_generator) {
     ROI roi;
     roi_generator->GenerateRandomROI(width, height, &roi);
-
+    // roi.x = 0;
+    // roi.y = 0;
+    // roi.w = 500;
+    // roi.h = 400;
     PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecodeParamsSetROI(decode_params_, roi.x, roi.y, roi.w, roi.h));
     height = roi.h;
     width = roi.w;
   }
 
-  std::vector<int64_t> out_shape = {output_components, height, width};
+  std::vector<int64_t> out_shape = {height, width, output_components};
   out->Resize(framework::make_ddim(out_shape));
 
   // allocate memory and assign to out_image
@@ -217,7 +229,9 @@ void NvjpegDecoder::Run(
   if (res) {
     return;
   }
+  // LOG(ERROR) << "ParseDecodeParams finish !!!";
   Decode(bit_stream, bit_len, &image);
+  // LOG(ERROR) << "Decode finish !!!";
 }
 
 NvjpegDecoderThreadPool::NvjpegDecoderThreadPool(const int num_threads, const std::string mode, const int dev_id)
diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu
index cfdfa456e39ea..0ee26752aebc3 100644
--- a/paddle/fluid/operators/math/math_function.cu
+++ b/paddle/fluid/operators/math/math_function.cu
@@ -50,6 +50,7 @@ template struct SetConstant<platform::CUDADeviceContext,
   template struct Transpose<platform::CUDADeviceContext, float16, RANK>;  \
   template struct Transpose<platform::CUDADeviceContext, bfloat16, RANK>; \
   template struct Transpose<platform::CUDADeviceContext, int8_t, RANK>;   \
+  template struct Transpose<platform::CUDADeviceContext, uint8_t, RANK>;  \
   template struct Transpose<platform::CUDADeviceContext, int32_t, RANK>;  \
   template struct Transpose<platform::CUDADeviceContext, int64_t, RANK>;  \
   template struct Transpose<platform::CUDADeviceContext,                  \
diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py
index 5faa991d6b576..9c62d437a151f 100644
--- a/python/paddle/vision/ops.py
+++ b/python/paddle/vision/ops.py
@@ -932,6 +932,7 @@ def image_decode(x, mode='unchanged', num_threads=2, name=None):
 def image_decode_random_crop(x,
                              mode='unchanged',
                              num_threads=2,
+                             data_layout='NCHW',
                              aspect_ratio_min=3./4.,
                              aspect_ratio_max=4./3.,
                              area_min=0.08,
@@ -982,7 +983,7 @@ def image_decode_random_crop(x,
                 core.VarDesc.VarType.LOD_TENSOR_ARRAY, False)
         program_id = utils._hash_with_id(mode, num_threads, name, local_rank)
         return _C_ops.batch_decode_random_crop(
-                x, out, "mode", mode, "num_threads", num_threads,
+                x, out, "mode", mode, "num_threads", num_threads, "data_layout", data_layout,
                 "aspect_ratio_min", aspect_ratio_min,
                 "aspect_ratio_max", aspect_ratio_max,
                 "area_min", area_min, "area_max", area_max,
@@ -992,6 +993,7 @@ def image_decode_random_crop(x,
     inputs = {'X': x}
     attrs = {"mode": mode,
              "num_threads": num_threads,
+             "data_layout": data_layout,
              "aspect_ratio_min": aspect_ratio_min,
              "aspect_ratio_max": aspect_ratio_max,
              "area_min": area_min,

From a30d38f95f3766a7f9c49c0088e75e73203da2be Mon Sep 17 00:00:00 2001
From: LielinJiang <jianglielin@baidu.com>
Date: Mon, 14 Feb 2022 06:37:21 +0000
Subject: [PATCH 2/2] clean code

---
 .../data/batch_decode_random_crop_op.h        | 36 -------------------
 .../fluid/operators/data/batch_resize_op.cu   | 13 -------
 .../operators/data/file_label_loader_op.h     |  4 +--
 paddle/fluid/operators/data/nvjpeg_decoder.cc |  9 ++---
 4 files changed, 4 insertions(+), 58 deletions(-)

diff --git a/paddle/fluid/operators/data/batch_decode_random_crop_op.h b/paddle/fluid/operators/data/batch_decode_random_crop_op.h
index f599c74a7dfb2..de96e38ca95ef 100644
--- a/paddle/fluid/operators/data/batch_decode_random_crop_op.h
+++ b/paddle/fluid/operators/data/batch_decode_random_crop_op.h
@@ -28,42 +28,6 @@ namespace paddle {
 namespace operators {
 namespace data {
 
-// template <typename DeviceContext, typename T>
-// void TransCompute(const int dim, const DeviceContext& dev_ctx,
-//                          const framework::Tensor& in, framework::Tensor* out,
-//                          const std::vector<int>& axis) {
-//   switch (dim) {
-//     case 1:
-//       math::Transpose<DeviceContext, T, 1> trans1;
-//       trans1(dev_ctx, in, out, axis);
-//       break;
-//     case 2:
-//       math::Transpose<DeviceContext, T, 2> trans2;
-//       trans2(dev_ctx, in, out, axis);
-//       break;
-//     case 3:
-//       math::Transpose<DeviceContext, T, 3> trans3;
-//       trans3(dev_ctx, in, out, axis);
-//       break;
-//     case 4:
-//       math::Transpose<DeviceContext, T, 4> trans4;
-//       trans4(dev_ctx, in, out, axis);
-//       break;
-//     case 5:
-//       math::Transpose<DeviceContext, T, 5> trans5;
-//       trans5(dev_ctx, in, out, axis);
-//       break;
-//     case 6:
-//       math::Transpose<DeviceContext, T, 6> trans6;
-//       trans6(dev_ctx, in, out, axis);
-//       break;
-//     default:
-//       // for dim >= 7 situation
-//       math::TransposeNormal<DeviceContext, T> trans_normal;
-//       trans_normal(dev_ctx, in, out, axis);
-//   }
-// }
-
 template <typename T>
 class CPUBatchDecodeRandomCropKernel : public framework::OpKernel<T> {
  public:
diff --git a/paddle/fluid/operators/data/batch_resize_op.cu b/paddle/fluid/operators/data/batch_resize_op.cu
index f7a7f52a3703e..7728a6b463163 100644
--- a/paddle/fluid/operators/data/batch_resize_op.cu
+++ b/paddle/fluid/operators/data/batch_resize_op.cu
@@ -234,18 +234,10 @@ class BatchResizeCUDAKernel : public framework::OpKernel<T> {
     bool align_corners = ctx.Attr<bool>("align_corners");
     int align_mode = ctx.Attr<int>("align_mode");
 
-    // int img_h, img_w;//, idx_h, idx_w, crop_h, crop_w;
-
     auto* img = &x->at(0);
     int64_t img_c = data_layout == DataLayout::kNCHW ? \
                   img->dims()[0] : img->dims()[2];
 
-    LOG(ERROR) << "img channel: " << img_c << " || " << data_layout_str;
-    // img_h =
-    //     data_layout == DataLayout::kNCHW ? img->dims()[1] : img->dims()[0];
-    // img_w =
-    //     data_layout == DataLayout::kNCHW ? img->dims()[2] : img->dims()[1];
-
     std::vector<int64_t> out_dim = {static_cast<int64_t>(x->size()),
                                     size[0], size[1], img_c};
     if (data_layout == DataLayout::kNCHW) {
@@ -255,11 +247,6 @@ class BatchResizeCUDAKernel : public framework::OpKernel<T> {
     out->Resize(framework::make_ddim(out_dim));
     out->mutable_data<T>(ctx.GetPlace());
 
-    // for (int i = 0; i < x->size(); i++) {
-    //   img = &x->at(i);
-    //   auto out_tensor = out->Slice(i, i + 1);
-    //   TensorCopySync(*img, ctx.GetPlace(), &out_tensor);
-    // }
     int img_h, img_w, idx_h, idx_w, crop_h, crop_w;
     for (int i = 0; i < x->size(); i++) {
       img = &x->at(i);
diff --git a/paddle/fluid/operators/data/file_label_loader_op.h b/paddle/fluid/operators/data/file_label_loader_op.h
index ffb08a6439b17..eef07790372a2 100644
--- a/paddle/fluid/operators/data/file_label_loader_op.h
+++ b/paddle/fluid/operators/data/file_label_loader_op.h
@@ -157,8 +157,8 @@ static std::vector<std::pair<std::string, int>>* GetFilesAndLabelsFromCache(cons
   if (iter == root_to_samples_.end()) {
     std::vector<std::pair<std::string, int>> samples;
     ParseFilesAndLabels(data_root, &samples);
-    std::cout << "files 0: " << samples[0].first << std::endl;
-    std::cout << "files 1: " << samples[1].first << std::endl;
+    // std::cout << "files 0: " << samples[0].first << std::endl;
+    // std::cout << "files 1: " << samples[1].first << std::endl;
     LOG(ERROR) << "Init samples: " << samples.size();
     root_to_samples_[data_root] = samples;
   }
diff --git a/paddle/fluid/operators/data/nvjpeg_decoder.cc b/paddle/fluid/operators/data/nvjpeg_decoder.cc
index 56159f81e51f6..188723ba306fe 100644
--- a/paddle/fluid/operators/data/nvjpeg_decoder.cc
+++ b/paddle/fluid/operators/data/nvjpeg_decoder.cc
@@ -85,7 +85,6 @@ void NvjpegDecoder::CPUDecodeRandomCropResize(const uint8_t* data, size_t length
                                 unsigned char* workspace, size_t workspace_size,
                                 framework::LoDTensor& temp, framework::LoDTensor* out, platform::Place place) {
   cv::Mat image =
-      // cv::imdecode(const_cast<unsigned char*>(data), cv::IMREAD_COLOR);
       cv::imdecode(cv::Mat(1, length, CV_8UC1, const_cast<unsigned char*>(data)), cv::IMREAD_COLOR);
   
   cv::Mat cropped;
@@ -95,14 +94,13 @@ void NvjpegDecoder::CPUDecodeRandomCropResize(const uint8_t* data, size_t length
     ROI roi;
     roi_generator->GenerateRandomROI(image.cols, image.rows, &roi);
     cv::Rect cv_roi;
-
     cv_roi.x = roi.x;
     cv_roi.y = roi.y;
     cv_roi.width = roi.w;
     cv_roi.height = roi.h;
     height = roi.h;
     width = roi.w;
-    // std::vector<int64_t> out_shape = {3, height, width};
+
     std::vector<int64_t> out_shape = {height, width, 3};
     temp.Resize(framework::make_ddim(out_shape));
     platform::CPUPlace cpu;
@@ -183,10 +181,7 @@ int NvjpegDecoder::ParseDecodeParams(
   if (roi_generator) {
     ROI roi;
     roi_generator->GenerateRandomROI(width, height, &roi);
-    // roi.x = 0;
-    // roi.y = 0;
-    // roi.w = 500;
-    // roi.h = 400;
+
     PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecodeParamsSetROI(decode_params_, roi.x, roi.y, roi.w, roi.h));
     height = roi.h;
     width = roi.w;