From 6b510559a1ea0034965eb16c87f263eeb64e66b4 Mon Sep 17 00:00:00 2001
From: ray96 <lqr5259@outlook.com>
Date: Fri, 16 Sep 2022 11:32:24 +0800
Subject: [PATCH 01/23] add support fp16 for deformable

---
 .../cpu/deformable_conv_grad_kernel.cc        | 231 ++++++++++--------
 .../kernels/funcs/deformable_conv_functor.cc  |  25 +-
 .../kernels/funcs/deformable_conv_functor.cu  |  43 +++-
 .../kernels/funcs/deformable_conv_functor.h   |  55 +++--
 .../gpu/deformable_conv_grad_kernel.cu        |  84 ++++---
 .../phi/kernels/gpu/deformable_conv_kernel.cu |   3 +-
 .../impl/deformable_conv_grad_kernel_impl.h   |  51 ++--
 .../impl/deformable_conv_kernel_impl.h        |   2 +
 .../unittests/test_deformable_conv_v1_op.py   |  27 +-
 9 files changed, 309 insertions(+), 212 deletions(-)
diff --git a/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc b/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc
index a4d43ef8fbe89..093da5fe026b2 100644
--- a/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc
@@ -14,15 +14,24 @@
 
 #include "paddle/phi/kernels/deformable_conv_grad_kernel.h"
 
-#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h"
 
 namespace phi {
 
+static constexpr int kNumCUDAThreads = 512;
+static constexpr int kNumMaximumNumBlocks = 4096;
+
+static inline int NumBlocks(const int N) {
+  return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
+                  kNumMaximumNumBlocks);
+}
+
 template <typename T>
-inline void ModulatedDeformableCol2imCPUKernel(
-    const int num_kernels,
+__global__ void ModulatedDeformableCol2imGpuKernel(
+    const int nthreads,
     const T* data_col,
     const T* data_offset,
     const T* data_mask,
@@ -43,7 +52,10 @@ inline void ModulatedDeformableCol2imCPUKernel(
     const int height_col,
     const int width_col,
     T* grad_im) {
-  for (int thread = 0; thread < num_kernels; thread++) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int offset = blockDim.x * gridDim.x;
+  using MT = typename phi::dtype::MPTypeTrait<T>::Type;
+  for (size_t thread = index; thread < nthreads; thread += offset) {
     const int j = (thread / width_col / height_col / batch_size) % kernel_w;
     const int i =
         (thread / width_col / height_col / batch_size / kernel_w) % kernel_h;
@@ -67,17 +79,17 @@ inline void ModulatedDeformableCol2imCPUKernel(
         ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
     const int data_mask_hw_ptr =
         ((i * kernel_w + j) * height_col + h_out) * width_col + w_out;
-    const T offset_h = data_offset_ptr[data_offset_h_ptr];
-    const T offset_w = data_offset_ptr[data_offset_w_ptr];
-    const T cur_inv_h_data = h_in + i * dilation_h + offset_h;
-    const T cur_inv_w_data = w_in + j * dilation_w + offset_w;
+    const MT offset_h = static_cast<MT>(data_offset_ptr[data_offset_h_ptr]);
+    const MT offset_w = static_cast<MT>(data_offset_ptr[data_offset_w_ptr]);
+    const MT cur_inv_h_data = h_in + i * dilation_h + offset_h;
+    const MT cur_inv_w_data = w_in + j * dilation_w + offset_w;
 
-    T cur_top_grad = data_col[thread];
+    MT cur_top_grad = static_cast<MT>(data_col[thread]);
     if (data_mask) {
       const T* data_mask_ptr =
           data_mask + (b * deformable_group + deformable_group_index) *
                           kernel_h * kernel_w * height_col * width_col;
-      const T mask = data_mask_ptr[data_mask_hw_ptr];
+      const MT mask = static_cast<MT>(data_mask_ptr[data_mask_hw_ptr]);
       cur_top_grad *= mask;
     }
     const int cur_h = static_cast<int>(cur_inv_h_data);
@@ -89,15 +101,16 @@ inline void ModulatedDeformableCol2imCPUKernel(
             abs(cur_inv_w_data - (cur_w + dx)) < 1) {
           int cur_bottom_grad_pos =
               ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
-          T weight = DmcnGetGradientWeight(cur_inv_h_data,
-                                           cur_inv_w_data,
-                                           cur_h + dy,
-                                           cur_w + dx,
-                                           height,
-                                           width);
+          MT weight = DmcnGetGradientWeight(cur_inv_h_data,
+                                            cur_inv_w_data,
+                                            cur_h + dy,
+                                            cur_w + dx,
+                                            height,
+                                            width);
 
-          *(grad_im + cur_bottom_grad_pos) =
-              *(grad_im + cur_bottom_grad_pos) + weight * cur_top_grad;
+          paddle::platform::CudaAtomicAdd(
+              grad_im + cur_bottom_grad_pos,
+              static_cast<T>(weight * cur_top_grad));
         }
       }
     }
@@ -119,33 +132,36 @@ void ModulatedDeformableCol2im(const Context& dev_ctx,
                                T* grad_im) {
   int channel_per_deformable_group = im_shape[0] / deformable_group;
   int num_kernels = col_shape[0] * col_shape[1] * col_shape[2] * col_shape[3];
+  int blocks = NumBlocks(num_kernels);
+  int threads = kNumCUDAThreads;
 
-  ModulatedDeformableCol2imCPUKernel(num_kernels,
-                                     data_col,
-                                     data_offset,
-                                     data_mask,
-                                     im_shape[0],
-                                     im_shape[1],
-                                     im_shape[2],
-                                     kernel_shape[2],
-                                     kernel_shape[3],
-                                     pad[0],
-                                     pad[1],
-                                     stride[0],
-                                     stride[1],
-                                     dilation[0],
-                                     dilation[1],
-                                     channel_per_deformable_group,
-                                     col_shape[1],
-                                     deformable_group,
-                                     col_shape[2],
-                                     col_shape[3],
-                                     grad_im);
+  ModulatedDeformableCol2imGpuKernel<T>
+      <<<blocks, threads, 0, dev_ctx.stream()>>>(num_kernels,
+                                                 data_col,
+                                                 data_offset,
+                                                 data_mask,
+                                                 im_shape[0],
+                                                 im_shape[1],
+                                                 im_shape[2],
+                                                 kernel_shape[2],
+                                                 kernel_shape[3],
+                                                 pad[0],
+                                                 pad[1],
+                                                 stride[0],
+                                                 stride[1],
+                                                 dilation[0],
+                                                 dilation[1],
+                                                 channel_per_deformable_group,
+                                                 col_shape[1],
+                                                 deformable_group,
+                                                 col_shape[2],
+                                                 col_shape[3],
+                                                 grad_im);
 }
 
 template <typename T>
-void ModulatedDeformableCol2imCoordCPUKernel(
-    const int num_kernels,
+__global__ void ModulatedDeformableCol2imCoordGpuKernel(
+    const int nthreads,
     const T* data_col,
     const T* data_im,
     const T* data_offset,
@@ -169,8 +185,11 @@ void ModulatedDeformableCol2imCoordCPUKernel(
     const int width_col,
     T* grad_offset,
     T* grad_mask) {
-  for (int i = 0; i < num_kernels; i++) {
-    T val = 0, mval = 0;
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int offset = blockDim.x * gridDim.x;
+  using MT = typename phi::dtype::MPTypeTrait<T>::Type;
+  for (size_t i = index; i < nthreads; i += offset) {
+    MT val = 0, mval = 0;
     const int w = i % width_col;
     const int h = (i / width_col) % height_col;
     const int c = (i / width_col / height_col) % offset_channels;
@@ -215,40 +234,42 @@ void ModulatedDeformableCol2imCoordCPUKernel(
       const int data_offset_w_ptr =
           (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +
            w_out);
-      const T offset_h = data_offset_ptr[data_offset_h_ptr];
-      const T offset_w = data_offset_ptr[data_offset_w_ptr];
-      T inv_h = h_in + i * dilation_h + offset_h;
-      T inv_w = w_in + j * dilation_w + offset_w;
+
+      const MT offset_h = static_cast<MT>(data_offset_ptr[data_offset_h_ptr]);
+      const MT offset_w = static_cast<MT>(data_offset_ptr[data_offset_w_ptr]);
+      MT inv_h = h_in + i * dilation_h + offset_h;
+      MT inv_w = w_in + j * dilation_w + offset_w;
       if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) {
         inv_h = inv_w = -2;
       } else {
-        mval += data_col_ptr[col_pos] *
-                funcs::DmcnIm2colBilinear(data_im_ptr + cnt * height * width,
-                                          width,
-                                          height,
-                                          width,
-                                          inv_h,
-                                          inv_w);
+        mval +=
+            static_cast<MT>(data_col_ptr[col_pos]) *
+            funcs::DmcnIm2colBilinear<T, MT>(data_im_ptr + cnt * height * width,
+                                             width,
+                                             height,
+                                             width,
+                                             inv_h,
+                                             inv_w);
       }
-      const T weight =
-          DmcnGetCoordinateWeight(inv_h,
-                                  inv_w,
-                                  height,
-                                  width,
-                                  data_im_ptr + cnt * height * width,
-                                  width,
-                                  bp_dir);
+      const MT weight =
+          DmcnGetCoordinateWeight<T, MT>(inv_h,
+                                         inv_w,
+                                         height,
+                                         width,
+                                         data_im_ptr + cnt * height * width,
+                                         width,
+                                         bp_dir);
       if (data_mask_ptr) {
         const int data_mask_hw_ptr =
             (((i * kernel_w + j) * height_col + h_out) * width_col + w_out);
-        const T mask = data_mask_ptr[data_mask_hw_ptr];
-        val += weight * data_col_ptr[col_pos] * mask;
+        const MT mask = static_cast<MT>(data_mask_ptr[data_mask_hw_ptr]);
+        val += weight * static_cast<MT>(data_col_ptr[col_pos]) * mask;
       } else {
-        val += weight * data_col_ptr[col_pos];
+        val += weight * static_cast<MT>(data_col_ptr[col_pos]);
       }
       cnt += 1;
     }
-    grad_offset[i] = val;
+    grad_offset[i] = static_cast<T>(val);
     if (grad_mask && offset_c % 2 == 0)
       grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h *
                       kernel_w +
@@ -278,32 +299,49 @@ void ModulatedDeformableCol2imCoord(const Context& dev_ctx,
   int num_kernels = 2 * kernel_shape[2] * kernel_shape[3] * col_shape[1] *
                     col_shape[2] * col_shape[3] * deformable_groups;
   int channel_per_deformable_group = col_shape[0] / deformable_groups;
+  int blocks = NumBlocks(num_kernels);
+  int threads = kNumCUDAThreads;
 
-  ModulatedDeformableCol2imCoordCPUKernel(
-      num_kernels,
-      data_col,
-      data_im,
-      data_offset,
-      data_mask,
-      im_shape[0],
-      im_shape[1],
-      im_shape[2],
-      kernel_shape[2],
-      kernel_shape[3],
-      paddings[0],
-      paddings[1],
-      strides[0],
-      strides[1],
-      dilations[0],
-      dilations[1],
-      channel_per_deformable_group,
-      col_shape[1],
-      2 * kernel_shape[2] * kernel_shape[3] * deformable_groups,
-      deformable_groups,
-      col_shape[2],
-      col_shape[3],
-      grad_offset,
-      grad_mask);
+  ModulatedDeformableCol2imCoordGpuKernel<T>
+      <<<blocks, threads, 0, dev_ctx.stream()>>>(
+          num_kernels,
+          data_col,
+          data_im,
+          data_offset,
+          data_mask,
+          im_shape[0],
+          im_shape[1],
+          im_shape[2],
+          kernel_shape[2],
+          kernel_shape[3],
+          paddings[0],
+          paddings[1],
+          strides[0],
+          strides[1],
+          dilations[0],
+          dilations[1],
+          channel_per_deformable_group,
+          col_shape[1],
+          2 * kernel_shape[2] * kernel_shape[3] * deformable_groups,
+          deformable_groups,
+          col_shape[2],
+          col_shape[3],
+          grad_offset,
+          grad_mask);
+}
+
+template <typename T>
+__global__ void FilterGradAddupGpuKernel(const int nthreads,
+                                         const int n,
+                                         const int height,
+                                         const int width,
+                                         const T* dweight_3d,
+                                         T* filter_grad) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int offset = blockDim.x * gridDim.x;
+  for (size_t i = index; i < nthreads; i += offset) {
+    filter_grad[i] = filter_grad[i] + dweight_3d[i];
+  }
 }
 
 template <typename T, typename Context>
@@ -314,16 +352,17 @@ void FilterGradAddup(const Context& dev_ctx,
                      const int width,
                      const T* dweight_3d,
                      T* filter_grad) {
-  for (int i = 0; i < nthreads; i++) {
-    filter_grad[i] = filter_grad[i] + dweight_3d[i];
-  }
+  FilterGradAddupGpuKernel<T>
+      <<<NumBlocks(nthreads), kNumCUDAThreads, 0, dev_ctx.stream()>>>(
+          nthreads, n, height, width, dweight_3d, filter_grad);
 }
 
 }  // namespace phi
 
 PD_REGISTER_KERNEL(deformable_conv_grad,
-                   CPU,
+                   GPU,
                    ALL_LAYOUT,
                    phi::DeformableConvGradKernel,
                    float,
-                   double) {}
+                   double,
+                   phi::dtype::float16) {}
diff --git a/paddle/phi/kernels/funcs/deformable_conv_functor.cc b/paddle/phi/kernels/funcs/deformable_conv_functor.cc
index 48858fa59390e..f6483b9cdd2e8 100644
--- a/paddle/phi/kernels/funcs/deformable_conv_functor.cc
+++ b/paddle/phi/kernels/funcs/deformable_conv_functor.cc
@@ -13,8 +13,10 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/funcs/deformable_conv_functor.h"
-
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/float16.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/common/amp_type_traits.h"
 
 namespace phi {
 namespace funcs {
@@ -42,6 +44,7 @@ inline void ModulatedDeformableIm2colCPUKernel(
     const int height_col,
     const int width_col,
     T* data_col) {
+  using MT = typename phi::dtype::MPTypeTrait<T>::Type;
   for (int i = 0; i < num_kernels; i++) {
     const int w_col = i % width_col;
     const int h_col = (i / width_col) % height_col;
@@ -76,22 +79,22 @@ inline void ModulatedDeformableIm2colCPUKernel(
             ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +
             w_col;
 
-        const T offset_h = data_offset_ptr[data_offset_h_ptr];
-        const T offset_w = data_offset_ptr[data_offset_w_ptr];
-        T val = static_cast<T>(0);
-        const T h_im = h_in + i * dilation_h + offset_h;
-        const T w_im = w_in + j * dilation_w + offset_w;
+        const MT offset_h = static_cast<MT>(data_offset_ptr[data_offset_h_ptr]);
+        const MT offset_w = static_cast<MT>(data_offset_ptr[data_offset_w_ptr]);
+        MT val = static_cast<MT>(0);
+        const MT h_im = h_in + i * dilation_h + offset_h;
+        const MT w_im = w_in + j * dilation_w + offset_w;
         if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) {
-          val =
-              DmcnIm2colBilinear(data_im_ptr, width, height, width, h_im, w_im);
+          val = DmcnIm2colBilinear<T, MT>(
+              data_im_ptr, width, height, width, h_im, w_im);
         }
-        *data_col_ptr = val;
         if (data_mask_ptr) {
           const int data_mask_hw_ptr =
               ((i * kernel_w + j) * height_col + h_col) * width_col + w_col;
-          const T mask = data_mask_ptr[data_mask_hw_ptr];
-          *data_col_ptr *= mask;
+          const MT mask = static_cast<MT>(data_mask_ptr[data_mask_hw_ptr]);
+          val *= mask;
         }
+        *data_col_ptr = static_cast<T>(val);
         data_col_ptr += batch_size * height_col * width_col;
       }
     }
diff --git a/paddle/phi/kernels/funcs/deformable_conv_functor.cu b/paddle/phi/kernels/funcs/deformable_conv_functor.cu
index bebea5dcb74ca..7ae2526406f2b 100644
--- a/paddle/phi/kernels/funcs/deformable_conv_functor.cu
+++ b/paddle/phi/kernels/funcs/deformable_conv_functor.cu
@@ -12,8 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/kernels/funcs/deformable_conv_functor.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/common/amp_type_traits.h"
+#include "paddle/phi/common/float16.h"
 
 namespace phi {
 namespace funcs {
@@ -51,6 +54,8 @@ __global__ void ModulatedDeformableIm2colGpuKernel(
     T* data_col) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   int offset = blockDim.x * gridDim.x;
+
+  using MT = typename phi::dtype::MPTypeTrait<T>::Type;
   for (size_t i = index; i < nthreads; i += offset) {
     const int w_col = i % width_col;
     const int h_col = (i / width_col) % height_col;
@@ -77,6 +82,8 @@ __global__ void ModulatedDeformableIm2colGpuKernel(
                               kernel_h * kernel_w * height_col * width_col
             : nullptr;
 
+    // 上边都动不了，指针移动也不会引起误差和速度降低
+
     for (int i = 0; i < kernel_h; ++i) {
       for (int j = 0; j < kernel_w; ++j) {
         const int data_offset_h_ptr =
@@ -85,22 +92,22 @@ __global__ void ModulatedDeformableIm2colGpuKernel(
             ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +
             w_col;
 
-        const T offset_h = data_offset_ptr[data_offset_h_ptr];
-        const T offset_w = data_offset_ptr[data_offset_w_ptr];
-        T val = static_cast<T>(0);
-        const T h_im = h_in + i * dilation_h + offset_h;
-        const T w_im = w_in + j * dilation_w + offset_w;
+        const MT offset_h = static_cast<MT>(data_offset_ptr[data_offset_h_ptr]);
+        const MT offset_w = static_cast<MT>(data_offset_ptr[data_offset_w_ptr]);
+        MT val = static_cast<MT>(0);
+        const MT h_im = h_in + i * dilation_h + offset_h;
+        const MT w_im = w_in + j * dilation_w + offset_w;
         if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) {
-          val =
-              DmcnIm2colBilinear(data_im_ptr, width, height, width, h_im, w_im);
+          val = DmcnIm2colBilinear<T, MT>(
+              data_im_ptr, width, height, width, h_im, w_im);
         }
-        *data_col_ptr = val;
         if (data_mask_ptr) {
           const int data_mask_hw_ptr =
               ((i * kernel_w + j) * height_col + h_col) * width_col + w_col;
-          const T mask = data_mask_ptr[data_mask_hw_ptr];
-          *data_col_ptr *= mask;
+          const MT mask = static_cast<MT>(data_mask_ptr[data_mask_hw_ptr]);
+          val *= mask;
         }
+        *data_col_ptr = static_cast<T>(val);
         data_col_ptr += batch_size * height_col * width_col;
       }
     }
@@ -164,6 +171,20 @@ template void ModulatedDeformableIm2col(
     const int deformable_groups,
     float* data_col);
 
+template void ModulatedDeformableIm2col(
+    const phi::GPUContext& dev_ctx,
+    const phi::dtype::float16* data_im,
+    const phi::dtype::float16* data_offset,
+    const phi::dtype::float16* data_mask,
+    const std::vector<int64_t>& im_shape,
+    const std::vector<int64_t>& col_shape,
+    const std::vector<int64_t>& filter_shape,
+    const std::vector<int>& paddings,
+    const std::vector<int>& strides,
+    const std::vector<int>& dilations,
+    const int deformable_groups,
+    phi::dtype::float16* data_col);
+
 template void ModulatedDeformableIm2col(
     const phi::GPUContext& dev_ctx,
     const double* data_im,
diff --git a/paddle/phi/kernels/funcs/deformable_conv_functor.h b/paddle/phi/kernels/funcs/deformable_conv_functor.h
index eecda72927510..62e42cd58334f 100644
--- a/paddle/phi/kernels/funcs/deformable_conv_functor.h
+++ b/paddle/phi/kernels/funcs/deformable_conv_functor.h
@@ -14,44 +14,47 @@
 
 #pragma once
 
+#include "paddle/phi/common/amp_type_traits.h"
+#include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/dense_tensor.h"
 
 namespace phi {
 namespace funcs {
 
-template <typename T>
-HOSTDEVICE T DmcnIm2colBilinear(const T* bottom_data,
-                                const int data_width,
-                                const int height,
-                                const int width,
-                                T h,
-                                T w) {
+template <typename T, typename MT>
+HOSTDEVICE MT DmcnIm2colBilinear(const T* bottom_data,
+                                 const int data_width,
+                                 const int height,
+                                 const int width,
+                                 MT h,
+                                 MT w) {
   int h_low = floor(h);
   int w_low = floor(w);
   int h_high = h_low + 1;
   int w_high = w_low + 1;
 
-  T lh = h - h_low;
-  T lw = w - w_low;
-  T hh = 1 - lh;
-  T hw = 1 - lw;
+  MT lh = h - h_low;
+  MT lw = w - w_low;
+  MT hh = 1 - lh;
+  MT hw = 1 - lw;
 
-  T v1 =
-      (h_low >= 0 && w_low >= 0) ? bottom_data[h_low * data_width + w_low] : 0;
-  T v2 = (h_low >= 0 && w_high <= width - 1)
-             ? bottom_data[h_low * data_width + w_high]
-             : 0;
-  T v3 = (h_high <= height - 1 && w_low >= 0)
-             ? bottom_data[h_high * data_width + w_low]
-             : 0;
-  T v4 = (h_high <= height - 1 && w_high <= width - 1)
-             ? bottom_data[h_high * data_width + w_high]
-             : 0;
+  MT v1 = (h_low >= 0 && w_low >= 0)
+              ? static_cast<MT>(bottom_data[h_low * data_width + w_low])
+              : 0;
+  MT v2 = (h_low >= 0 && w_high <= width - 1)
+              ? static_cast<MT>(bottom_data[h_low * data_width + w_high])
+              : 0;
+  MT v3 = (h_high <= height - 1 && w_low >= 0)
+              ? static_cast<MT>(bottom_data[h_high * data_width + w_low])
+              : 0;
+  MT v4 = (h_high <= height - 1 && w_high <= width - 1)
+              ? static_cast<MT>(bottom_data[h_high * data_width + w_high])
+              : 0;
 
-  T w1 = hh * hw;
-  T w2 = hh * lw;
-  T w3 = lh * hw;
-  T w4 = lh * lw;
+  MT w1 = hh * hw;
+  MT w2 = hh * lw;
+  MT w3 = lh * hw;
+  MT w4 = lh * lw;
 
   return w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4;
 }
diff --git a/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu b/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu
index b46f1f4a3314d..093da5fe026b2 100644
--- a/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu
@@ -54,6 +54,7 @@ __global__ void ModulatedDeformableCol2imGpuKernel(
     T* grad_im) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   int offset = blockDim.x * gridDim.x;
+  using MT = typename phi::dtype::MPTypeTrait<T>::Type;
   for (size_t thread = index; thread < nthreads; thread += offset) {
     const int j = (thread / width_col / height_col / batch_size) % kernel_w;
     const int i =
@@ -78,17 +79,17 @@ __global__ void ModulatedDeformableCol2imGpuKernel(
         ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
     const int data_mask_hw_ptr =
         ((i * kernel_w + j) * height_col + h_out) * width_col + w_out;
-    const T offset_h = data_offset_ptr[data_offset_h_ptr];
-    const T offset_w = data_offset_ptr[data_offset_w_ptr];
-    const T cur_inv_h_data = h_in + i * dilation_h + offset_h;
-    const T cur_inv_w_data = w_in + j * dilation_w + offset_w;
+    const MT offset_h = static_cast<MT>(data_offset_ptr[data_offset_h_ptr]);
+    const MT offset_w = static_cast<MT>(data_offset_ptr[data_offset_w_ptr]);
+    const MT cur_inv_h_data = h_in + i * dilation_h + offset_h;
+    const MT cur_inv_w_data = w_in + j * dilation_w + offset_w;
 
-    T cur_top_grad = data_col[thread];
+    MT cur_top_grad = static_cast<MT>(data_col[thread]);
     if (data_mask) {
       const T* data_mask_ptr =
           data_mask + (b * deformable_group + deformable_group_index) *
                           kernel_h * kernel_w * height_col * width_col;
-      const T mask = data_mask_ptr[data_mask_hw_ptr];
+      const MT mask = static_cast<MT>(data_mask_ptr[data_mask_hw_ptr]);
       cur_top_grad *= mask;
     }
     const int cur_h = static_cast<int>(cur_inv_h_data);
@@ -100,15 +101,16 @@ __global__ void ModulatedDeformableCol2imGpuKernel(
             abs(cur_inv_w_data - (cur_w + dx)) < 1) {
           int cur_bottom_grad_pos =
               ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
-          T weight = DmcnGetGradientWeight(cur_inv_h_data,
-                                           cur_inv_w_data,
-                                           cur_h + dy,
-                                           cur_w + dx,
-                                           height,
-                                           width);
+          MT weight = DmcnGetGradientWeight(cur_inv_h_data,
+                                            cur_inv_w_data,
+                                            cur_h + dy,
+                                            cur_w + dx,
+                                            height,
+                                            width);
 
-          paddle::platform::CudaAtomicAdd(grad_im + cur_bottom_grad_pos,
-                                          weight * cur_top_grad);
+          paddle::platform::CudaAtomicAdd(
+              grad_im + cur_bottom_grad_pos,
+              static_cast<T>(weight * cur_top_grad));
         }
       }
     }
@@ -185,8 +187,9 @@ __global__ void ModulatedDeformableCol2imCoordGpuKernel(
     T* grad_mask) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   int offset = blockDim.x * gridDim.x;
+  using MT = typename phi::dtype::MPTypeTrait<T>::Type;
   for (size_t i = index; i < nthreads; i += offset) {
-    T val = 0, mval = 0;
+    MT val = 0, mval = 0;
     const int w = i % width_col;
     const int h = (i / width_col) % height_col;
     const int c = (i / width_col / height_col) % offset_channels;
@@ -231,40 +234,42 @@ __global__ void ModulatedDeformableCol2imCoordGpuKernel(
       const int data_offset_w_ptr =
           (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +
            w_out);
-      const T offset_h = data_offset_ptr[data_offset_h_ptr];
-      const T offset_w = data_offset_ptr[data_offset_w_ptr];
-      T inv_h = h_in + i * dilation_h + offset_h;
-      T inv_w = w_in + j * dilation_w + offset_w;
+
+      const MT offset_h = static_cast<MT>(data_offset_ptr[data_offset_h_ptr]);
+      const MT offset_w = static_cast<MT>(data_offset_ptr[data_offset_w_ptr]);
+      MT inv_h = h_in + i * dilation_h + offset_h;
+      MT inv_w = w_in + j * dilation_w + offset_w;
       if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) {
         inv_h = inv_w = -2;
       } else {
-        mval += data_col_ptr[col_pos] *
-                funcs::DmcnIm2colBilinear(data_im_ptr + cnt * height * width,
-                                          width,
-                                          height,
-                                          width,
-                                          inv_h,
-                                          inv_w);
+        mval +=
+            static_cast<MT>(data_col_ptr[col_pos]) *
+            funcs::DmcnIm2colBilinear<T, MT>(data_im_ptr + cnt * height * width,
+                                             width,
+                                             height,
+                                             width,
+                                             inv_h,
+                                             inv_w);
       }
-      const T weight =
-          DmcnGetCoordinateWeight(inv_h,
-                                  inv_w,
-                                  height,
-                                  width,
-                                  data_im_ptr + cnt * height * width,
-                                  width,
-                                  bp_dir);
+      const MT weight =
+          DmcnGetCoordinateWeight<T, MT>(inv_h,
+                                         inv_w,
+                                         height,
+                                         width,
+                                         data_im_ptr + cnt * height * width,
+                                         width,
+                                         bp_dir);
       if (data_mask_ptr) {
         const int data_mask_hw_ptr =
             (((i * kernel_w + j) * height_col + h_out) * width_col + w_out);
-        const T mask = data_mask_ptr[data_mask_hw_ptr];
-        val += weight * data_col_ptr[col_pos] * mask;
+        const MT mask = static_cast<MT>(data_mask_ptr[data_mask_hw_ptr]);
+        val += weight * static_cast<MT>(data_col_ptr[col_pos]) * mask;
       } else {
-        val += weight * data_col_ptr[col_pos];
+        val += weight * static_cast<MT>(data_col_ptr[col_pos]);
       }
       cnt += 1;
     }
-    grad_offset[i] = val;
+    grad_offset[i] = static_cast<T>(val);
     if (grad_mask && offset_c % 2 == 0)
       grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h *
                       kernel_w +
@@ -359,4 +364,5 @@ PD_REGISTER_KERNEL(deformable_conv_grad,
                    ALL_LAYOUT,
                    phi::DeformableConvGradKernel,
                    float,
-                   double) {}
+                   double,
+                   phi::dtype::float16) {}
diff --git a/paddle/phi/kernels/gpu/deformable_conv_kernel.cu b/paddle/phi/kernels/gpu/deformable_conv_kernel.cu
index 2476dcbafb984..021791ca93061 100644
--- a/paddle/phi/kernels/gpu/deformable_conv_kernel.cu
+++ b/paddle/phi/kernels/gpu/deformable_conv_kernel.cu
@@ -23,4 +23,5 @@ PD_REGISTER_KERNEL(deformable_conv,
                    ALL_LAYOUT,
                    phi::DeformableConvKernel,
                    float,
-                   double) {}
+                   double,
+                   phi::dtype::float16) {}
diff --git a/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h b/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h
index 744c48b2bfbd6..90a5243fc72d4 100644
--- a/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h
@@ -14,6 +14,7 @@
 
 #pragma once
 
+#include "paddle/phi/common/amp_type_traits.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/hostdevice.h"
 #include "paddle/phi/kernels/empty_kernel.h"
@@ -58,14 +59,14 @@ HOSTDEVICE T DmcnGetGradientWeight(T argmax_h,
   return weight;
 }
 
-template <typename T>
-HOSTDEVICE T DmcnGetCoordinateWeight(T argmax_h,
-                                     T argmax_w,
-                                     const int height,
-                                     const int width,
-                                     const T* im_data,
-                                     const int data_width,
-                                     const int bp_dir) {
+template <typename T, typename MT>
+HOSTDEVICE MT DmcnGetCoordinateWeight(MT argmax_h,
+                                      MT argmax_w,
+                                      const int height,
+                                      const int width,
+                                      const T* im_data,
+                                      const int data_width,
+                                      const int bp_dir) {
   if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
       argmax_w >= width) {
     return 0;
@@ -76,43 +77,51 @@ HOSTDEVICE T DmcnGetCoordinateWeight(T argmax_h,
   int argmax_h_high = argmax_h_low + 1;
   int argmax_w_high = argmax_w_low + 1;
 
-  T weight = 0;
+  MT weight = 0;
 
   if (bp_dir == 0) {
     weight += (argmax_h_low >= 0 && argmax_w_low >= 0)
                   ? -1 * (argmax_w_low + 1 - argmax_w) *
-                        im_data[argmax_h_low * data_width + argmax_w_low]
+                        static_cast<MT>(
+                            im_data[argmax_h_low * data_width + argmax_w_low])
                   : 0;
 
     weight += (argmax_h_low >= 0 && argmax_w_high <= width - 1)
                   ? -1 * (argmax_w - argmax_w_low) *
-                        im_data[argmax_h_low * data_width + argmax_w_high]
+                        static_cast<MT>(
+                            im_data[argmax_h_low * data_width + argmax_w_high])
                   : 0;
 
     weight += (argmax_h_high <= height - 1 && argmax_w_low >= 0)
                   ? (argmax_w_low + 1 - argmax_w) *
-                        im_data[argmax_h_high * data_width + argmax_w_low]
+                        static_cast<MT>(
+                            im_data[argmax_h_high * data_width + argmax_w_low])
                   : 0;
     weight += (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
                   ? (argmax_w - argmax_w_low) *
-                        im_data[argmax_h_high * data_width + argmax_w_high]
+                        static_cast<MT>(
+                            im_data[argmax_h_high * data_width + argmax_w_high])
                   : 0;
   } else if (bp_dir == 1) {
     weight += (argmax_h_low >= 0 && argmax_w_low >= 0)
                   ? -1 * (argmax_h_low + 1 - argmax_h) *
-                        im_data[argmax_h_low * data_width + argmax_w_low]
+                        static_cast<MT>(
+                            im_data[argmax_h_low * data_width + argmax_w_low])
                   : 0;
     weight += (argmax_h_low >= 0 && argmax_w_high <= width - 1)
                   ? (argmax_h_low + 1 - argmax_h) *
-                        im_data[argmax_h_low * data_width + argmax_w_high]
+                        static_cast<MT>(
+                            im_data[argmax_h_low * data_width + argmax_w_high])
                   : 0;
     weight += (argmax_h_high <= height - 1 && argmax_w_low >= 0)
                   ? -1 * (argmax_h - argmax_h_low) *
-                        im_data[argmax_h_high * data_width + argmax_w_low]
+                        static_cast<MT>(
+                            im_data[argmax_h_high * data_width + argmax_w_low])
                   : 0;
     weight += (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
                   ? (argmax_h - argmax_h_low) *
-                        im_data[argmax_h_high * data_width + argmax_w_high]
+                        static_cast<MT>(
+                            im_data[argmax_h_high * data_width + argmax_w_high])
                   : 0;
   }
 
@@ -290,7 +299,7 @@ void DeformableConvGradKernel(const Context& dev_ctx,
           deformable_groups,
           offset_grad_ptr + i * im2col_step * input_offset_dim,
           mask_grad_data_ptr);
-    }
+    }  // check
     if (dx) {
       T* dx_ptr = dx->data<T>();
       // get grad of input
@@ -305,7 +314,7 @@ void DeformableConvGradKernel(const Context& dev_ctx,
                                 strides,
                                 dilations,
                                 deformable_groups,
-                                dx_ptr + i * im2col_step * input_dim);
+                                dx_ptr + i * im2col_step * input_dim);  //待改
       dx->Resize(x.dims());
     }
 
@@ -321,7 +330,7 @@ void DeformableConvGradKernel(const Context& dev_ctx,
         strides,
         dilations,
         deformable_groups,
-        col_buffer_ptr);
+        col_buffer_ptr);  // check
 
     col_buffer_3d.Resize(col_buffer_3d_shape);
 
@@ -353,7 +362,7 @@ void DeformableConvGradKernel(const Context& dev_ctx,
                          K,
                          M,
                          dweight_3d.data<T>(),
-                         filter_grad->data<T>());
+                         filter_grad->data<T>());  // 待改
     }
   }
   if (filter_grad) {
diff --git a/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h b/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h
index f864c2e5f0ed0..6bda245f3bda9 100644
--- a/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h
+++ b/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h
@@ -14,6 +14,7 @@
 
 #pragma once
 
+#include "paddle/phi/common/amp_type_traits.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/hostdevice.h"
 #include "paddle/phi/kernels/empty_kernel.h"
@@ -38,6 +39,7 @@ void DeformableConvKernel(const Context& dev_ctx,
                           DenseTensor* out) {
   const int batch_size = static_cast<int>(x.dims()[0]);
 
+  using MT = typename phi::dtype::MPTypeTrait<T>::Type;
   std::vector<int64_t> filter_shape_vec(phi::vectorize(filter.dims()));
   std::vector<int64_t> output_shape_vec(phi::vectorize(out->dims()));
 
diff --git a/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py b/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py
index a60881e8dded2..57afd9f45aa27 100644
--- a/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py
+++ b/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py
@@ -23,6 +23,9 @@
 from paddle.fluid.framework import _test_eager_guard
 
 
+paddle.enable_static()
+
+
 def dmc_bilinear(data_im, height, width, h, w):
     h_low = int(np.floor(h))
     w_low = int(np.floor(w))
@@ -61,8 +64,8 @@ def dconv_im2col_gemm(input, offset, filter, group, conv_param):
     assert f_c * group == in_c
     assert np.mod(out_c, group) == 0
 
-    stride, pad, dilation = conv_param['stride'], conv_param['pad'],\
-        conv_param['dilation']
+    stride, pad, dilation = conv_param['stride'], conv_param['pad'], \
+                            conv_param['dilation']
     out_h = 1 + (in_h + 2 * pad[0] - (dilation[0] * (f_h - 1) + 1)) // stride[0]
     out_w = 1 + (in_w + 2 * pad[1] - (dilation[1] * (f_w - 1) + 1)) // stride[1]
     assert out_h == in_h
@@ -76,18 +79,18 @@ def dconv_im2col_gemm(input, offset, filter, group, conv_param):
                     for kh in range(f_h):
                         for kw in range(f_w):
                             offset_h_table = \
-                                    offset[n, ::2, h, w].reshape(f_h, f_w)
+                                offset[n, ::2, h, w].reshape(f_h, f_w)
                             offset_w_table = \
-                                    offset[n, 1::2, h, w].reshape(f_h, f_w)
+                                offset[n, 1::2, h, w].reshape(f_h, f_w)
                             offset_h = offset_h_table[kh, kw]
                             offset_w = offset_w_table[kh, kw]
                             val = 0
                             im_h = h * stride[0] + kh * dilation[0] \
-                                + offset_h - pad[0]
+                                   + offset_h - pad[0]
                             im_w = w * stride[0] + kw * dilation[0] \
-                                + offset_w - pad[1]
+                                   + offset_w - pad[1]
                             if im_h > -1 and im_w > -1 and \
-                                im_h < in_h and im_w < in_h:
+                                    im_h < in_h and im_w < in_h:
                                 val = dmc_bilinear(input[n, c], in_h, in_w,
                                                    im_h, im_w)
                             val_out = val
@@ -201,6 +204,7 @@ def init_type(self):
         self.dtype = np.float32
 
 
+
 class TestWithStride(TestModulatedDeformableConvOp):
 
     def init_test_case(self):
@@ -286,6 +290,15 @@ def init_type(self):
         self.dtype = np.float64
 
 
+class TestWithFloat16(TestModulatedDeformableConvOp):
+
+    def init_type(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        self.check_output(check_eager=True, atol=1e-3)
+
+
 class TestModulatedDeformableConvV1InvalidInput(unittest.TestCase):
 
     def test_error(self):

From 3fa8f48afc683798debbb7e70c0d9e73a75cc6f5 Mon Sep 17 00:00:00 2001
From: ray96 <lqr5259@outlook.com>
Date: Fri, 16 Sep 2022 11:35:04 +0800
Subject: [PATCH 02/23] fix error push

---
 .../cpu/deformable_conv_grad_kernel.cc        | 172 +++++++-----------
 1 file changed, 68 insertions(+), 104 deletions(-)

diff --git a/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc b/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc
index 093da5fe026b2..f6ff5d63372a0 100644
--- a/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc
@@ -14,24 +14,15 @@
 
 #include "paddle/phi/kernels/deformable_conv_grad_kernel.h"
 
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
-#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h"
 
 namespace phi {
 
-static constexpr int kNumCUDAThreads = 512;
-static constexpr int kNumMaximumNumBlocks = 4096;
-
-static inline int NumBlocks(const int N) {
-  return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
-                  kNumMaximumNumBlocks);
-}
-
 template <typename T>
-__global__ void ModulatedDeformableCol2imGpuKernel(
-    const int nthreads,
+inline void ModulatedDeformableCol2imCPUKernel(
+    const int num_kernels,
     const T* data_col,
     const T* data_offset,
     const T* data_mask,
@@ -52,10 +43,9 @@ __global__ void ModulatedDeformableCol2imGpuKernel(
     const int height_col,
     const int width_col,
     T* grad_im) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int offset = blockDim.x * gridDim.x;
+
   using MT = typename phi::dtype::MPTypeTrait<T>::Type;
-  for (size_t thread = index; thread < nthreads; thread += offset) {
+  for (int thread = 0; thread < num_kernels; thread++) {
     const int j = (thread / width_col / height_col / batch_size) % kernel_w;
     const int i =
         (thread / width_col / height_col / batch_size / kernel_w) % kernel_h;
@@ -108,9 +98,8 @@ __global__ void ModulatedDeformableCol2imGpuKernel(
                                             height,
                                             width);
 
-          paddle::platform::CudaAtomicAdd(
-              grad_im + cur_bottom_grad_pos,
-              static_cast<T>(weight * cur_top_grad));
+          *(grad_im + cur_bottom_grad_pos) =
+              *(grad_im + cur_bottom_grad_pos) + static_cast<T>(weight * cur_top_grad);
         }
       }
     }
@@ -132,36 +121,33 @@ void ModulatedDeformableCol2im(const Context& dev_ctx,
                                T* grad_im) {
   int channel_per_deformable_group = im_shape[0] / deformable_group;
   int num_kernels = col_shape[0] * col_shape[1] * col_shape[2] * col_shape[3];
-  int blocks = NumBlocks(num_kernels);
-  int threads = kNumCUDAThreads;
 
-  ModulatedDeformableCol2imGpuKernel<T>
-      <<<blocks, threads, 0, dev_ctx.stream()>>>(num_kernels,
-                                                 data_col,
-                                                 data_offset,
-                                                 data_mask,
-                                                 im_shape[0],
-                                                 im_shape[1],
-                                                 im_shape[2],
-                                                 kernel_shape[2],
-                                                 kernel_shape[3],
-                                                 pad[0],
-                                                 pad[1],
-                                                 stride[0],
-                                                 stride[1],
-                                                 dilation[0],
-                                                 dilation[1],
-                                                 channel_per_deformable_group,
-                                                 col_shape[1],
-                                                 deformable_group,
-                                                 col_shape[2],
-                                                 col_shape[3],
-                                                 grad_im);
+  ModulatedDeformableCol2imCPUKernel(num_kernels,
+                                     data_col,
+                                     data_offset,
+                                     data_mask,
+                                     im_shape[0],
+                                     im_shape[1],
+                                     im_shape[2],
+                                     kernel_shape[2],
+                                     kernel_shape[3],
+                                     pad[0],
+                                     pad[1],
+                                     stride[0],
+                                     stride[1],
+                                     dilation[0],
+                                     dilation[1],
+                                     channel_per_deformable_group,
+                                     col_shape[1],
+                                     deformable_group,
+                                     col_shape[2],
+                                     col_shape[3],
+                                     grad_im);
 }
 
 template <typename T>
-__global__ void ModulatedDeformableCol2imCoordGpuKernel(
-    const int nthreads,
+void ModulatedDeformableCol2imCoordCPUKernel(
+    const int num_kernels,
     const T* data_col,
     const T* data_im,
     const T* data_offset,
@@ -185,10 +171,8 @@ __global__ void ModulatedDeformableCol2imCoordGpuKernel(
     const int width_col,
     T* grad_offset,
     T* grad_mask) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int offset = blockDim.x * gridDim.x;
   using MT = typename phi::dtype::MPTypeTrait<T>::Type;
-  for (size_t i = index; i < nthreads; i += offset) {
+  for (int i = 0; i < num_kernels; i++) {
     MT val = 0, mval = 0;
     const int w = i % width_col;
     const int h = (i / width_col) % height_col;
@@ -234,7 +218,6 @@ __global__ void ModulatedDeformableCol2imCoordGpuKernel(
       const int data_offset_w_ptr =
           (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +
            w_out);
-
       const MT offset_h = static_cast<MT>(data_offset_ptr[data_offset_h_ptr]);
       const MT offset_w = static_cast<MT>(data_offset_ptr[data_offset_w_ptr]);
       MT inv_h = h_in + i * dilation_h + offset_h;
@@ -242,14 +225,13 @@ __global__ void ModulatedDeformableCol2imCoordGpuKernel(
       if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) {
         inv_h = inv_w = -2;
       } else {
-        mval +=
-            static_cast<MT>(data_col_ptr[col_pos]) *
-            funcs::DmcnIm2colBilinear<T, MT>(data_im_ptr + cnt * height * width,
-                                             width,
-                                             height,
-                                             width,
-                                             inv_h,
-                                             inv_w);
+        mval += static_cast<MT>(data_col_ptr[col_pos]) *
+                funcs::DmcnIm2colBilinear<T, MT>(data_im_ptr + cnt * height * width,
+                                                 width,
+                                                 height,
+                                                 width,
+                                                 inv_h,
+                                                 inv_w);
       }
       const MT weight =
           DmcnGetCoordinateWeight<T, MT>(inv_h,
@@ -299,49 +281,32 @@ void ModulatedDeformableCol2imCoord(const Context& dev_ctx,
   int num_kernels = 2 * kernel_shape[2] * kernel_shape[3] * col_shape[1] *
                     col_shape[2] * col_shape[3] * deformable_groups;
   int channel_per_deformable_group = col_shape[0] / deformable_groups;
-  int blocks = NumBlocks(num_kernels);
-  int threads = kNumCUDAThreads;
-
-  ModulatedDeformableCol2imCoordGpuKernel<T>
-      <<<blocks, threads, 0, dev_ctx.stream()>>>(
-          num_kernels,
-          data_col,
-          data_im,
-          data_offset,
-          data_mask,
-          im_shape[0],
-          im_shape[1],
-          im_shape[2],
-          kernel_shape[2],
-          kernel_shape[3],
-          paddings[0],
-          paddings[1],
-          strides[0],
-          strides[1],
-          dilations[0],
-          dilations[1],
-          channel_per_deformable_group,
-          col_shape[1],
-          2 * kernel_shape[2] * kernel_shape[3] * deformable_groups,
-          deformable_groups,
-          col_shape[2],
-          col_shape[3],
-          grad_offset,
-          grad_mask);
-}
 
-template <typename T>
-__global__ void FilterGradAddupGpuKernel(const int nthreads,
-                                         const int n,
-                                         const int height,
-                                         const int width,
-                                         const T* dweight_3d,
-                                         T* filter_grad) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int offset = blockDim.x * gridDim.x;
-  for (size_t i = index; i < nthreads; i += offset) {
-    filter_grad[i] = filter_grad[i] + dweight_3d[i];
-  }
+  ModulatedDeformableCol2imCoordCPUKernel(
+      num_kernels,
+      data_col,
+      data_im,
+      data_offset,
+      data_mask,
+      im_shape[0],
+      im_shape[1],
+      im_shape[2],
+      kernel_shape[2],
+      kernel_shape[3],
+      paddings[0],
+      paddings[1],
+      strides[0],
+      strides[1],
+      dilations[0],
+      dilations[1],
+      channel_per_deformable_group,
+      col_shape[1],
+      2 * kernel_shape[2] * kernel_shape[3] * deformable_groups,
+      deformable_groups,
+      col_shape[2],
+      col_shape[3],
+      grad_offset,
+      grad_mask);
 }
 
 template <typename T, typename Context>
@@ -352,17 +317,16 @@ void FilterGradAddup(const Context& dev_ctx,
                      const int width,
                      const T* dweight_3d,
                      T* filter_grad) {
-  FilterGradAddupGpuKernel<T>
-      <<<NumBlocks(nthreads), kNumCUDAThreads, 0, dev_ctx.stream()>>>(
-          nthreads, n, height, width, dweight_3d, filter_grad);
+  for (int i = 0; i < nthreads; i++) {
+    filter_grad[i] = filter_grad[i] + dweight_3d[i];
+  }
 }
 
 }  // namespace phi
 
 PD_REGISTER_KERNEL(deformable_conv_grad,
-                   GPU,
+                   CPU,
                    ALL_LAYOUT,
                    phi::DeformableConvGradKernel,
                    float,
-                   double,
-                   phi::dtype::float16) {}
+                   double) {}

From f6c8424428108ab4aeed6124aeaf8ee794800385 Mon Sep 17 00:00:00 2001
From: ray96 <lqr5259@outlook.com>
Date: Fri, 16 Sep 2022 14:09:44 +0800
Subject: [PATCH 03/23] fix slow part

---
 paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu b/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu
index 093da5fe026b2..0704234694d9c 100644
--- a/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu
@@ -108,9 +108,9 @@ __global__ void ModulatedDeformableCol2imGpuKernel(
                                             height,
                                             width);
 
-          paddle::platform::CudaAtomicAdd(
-              grad_im + cur_bottom_grad_pos,
-              static_cast<T>(weight * cur_top_grad));
+//          慢在这 ，不支持float16
+          *(grad_im + cur_bottom_grad_pos) =
+              *(grad_im + cur_bottom_grad_pos) + static_cast<T>(weight * cur_top_grad);
         }
       }
     }

From 62990e7888a1c106f934392ea7404d0fa40cb667 Mon Sep 17 00:00:00 2001
From: ray96 <lqr5259@outlook.com>
Date: Fri, 16 Sep 2022 20:30:49 +0800
Subject: [PATCH 04/23] set datetype

---
 paddle/phi/kernels/gpu/deformable_conv_kernel.cu | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/paddle/phi/kernels/gpu/deformable_conv_kernel.cu b/paddle/phi/kernels/gpu/deformable_conv_kernel.cu
index 021791ca93061..31348fbb2c28d 100644
--- a/paddle/phi/kernels/gpu/deformable_conv_kernel.cu
+++ b/paddle/phi/kernels/gpu/deformable_conv_kernel.cu
@@ -24,4 +24,9 @@ PD_REGISTER_KERNEL(deformable_conv,
                    phi::DeformableConvKernel,
                    float,
                    double,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16) {
+  if (kernel_key.dtype() == phi::DataType::FLOAT16) {
+    kernel->InputAt(3).SetDataType(phi::DataType::FLOAT32);
+    kernel->InputAt(4).SetDataType(phi::DataType::FLOAT32);
+  }
+}

From 2820e9b3f7ae103a4ddf18bcd5936aa3117d690c Mon Sep 17 00:00:00 2001
From: ray96 <lqr5259@outlook.com>
Date: Mon, 19 Sep 2022 11:10:44 +0800
Subject: [PATCH 05/23] fix im2col_step bug and codestyle

---
 .../cpu/deformable_conv_grad_kernel.cc        | 19 +++---
 .../gpu/deformable_conv_grad_kernel.cu        |  5 +-
 .../impl/deformable_conv_kernel_impl.h        | 68 +++++++++++++++----
 .../unittests/test_deformable_conv_v1_op.py   |  3 -
 4 files changed, 69 insertions(+), 26 deletions(-)

diff --git a/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc b/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc
index f6ff5d63372a0..962576154a7bf 100644
--- a/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc
@@ -43,7 +43,6 @@ inline void ModulatedDeformableCol2imCPUKernel(
     const int height_col,
     const int width_col,
     T* grad_im) {
-
   using MT = typename phi::dtype::MPTypeTrait<T>::Type;
   for (int thread = 0; thread < num_kernels; thread++) {
     const int j = (thread / width_col / height_col / batch_size) % kernel_w;
@@ -99,7 +98,8 @@ inline void ModulatedDeformableCol2imCPUKernel(
                                             width);
 
           *(grad_im + cur_bottom_grad_pos) =
-              *(grad_im + cur_bottom_grad_pos) + static_cast<T>(weight * cur_top_grad);
+              *(grad_im + cur_bottom_grad_pos) +
+              static_cast<T>(weight * cur_top_grad);
         }
       }
     }
@@ -225,13 +225,14 @@ void ModulatedDeformableCol2imCoordCPUKernel(
       if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) {
         inv_h = inv_w = -2;
       } else {
-        mval += static_cast<MT>(data_col_ptr[col_pos]) *
-                funcs::DmcnIm2colBilinear<T, MT>(data_im_ptr + cnt * height * width,
-                                                 width,
-                                                 height,
-                                                 width,
-                                                 inv_h,
-                                                 inv_w);
+        mval +=
+            static_cast<MT>(data_col_ptr[col_pos]) *
+            funcs::DmcnIm2colBilinear<T, MT>(data_im_ptr + cnt * height * width,
+                                             width,
+                                             height,
+                                             width,
+                                             inv_h,
+                                             inv_w);
       }
       const MT weight =
           DmcnGetCoordinateWeight<T, MT>(inv_h,
diff --git a/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu b/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu
index 0704234694d9c..a87da6102f44a 100644
--- a/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu
@@ -108,9 +108,10 @@ __global__ void ModulatedDeformableCol2imGpuKernel(
                                             height,
                                             width);
 
-//          慢在这 ，不支持float16
+          //          慢在这 ，不支持float16
           *(grad_im + cur_bottom_grad_pos) =
-              *(grad_im + cur_bottom_grad_pos) + static_cast<T>(weight * cur_top_grad);
+              *(grad_im + cur_bottom_grad_pos) +
+              static_cast<T>(weight * cur_top_grad);
         }
       }
     }
diff --git a/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h b/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h
index 6bda245f3bda9..450d7a48e2bfb 100644
--- a/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h
+++ b/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h
@@ -20,6 +20,7 @@
 #include "paddle/phi/kernels/empty_kernel.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/deformable_conv_functor.h"
+#include "paddle/phi/kernels/transpose_kernel.h"
 #include "paddle/utils/optional.h"
 
 namespace phi {
@@ -39,6 +40,11 @@ void DeformableConvKernel(const Context& dev_ctx,
                           DenseTensor* out) {
   const int batch_size = static_cast<int>(x.dims()[0]);
 
+  int temp_step = std::min(64, batch_size);
+  if (batch_size % temp_step == 0) {
+    im2col_step = temp_step;
+  }
+
   using MT = typename phi::dtype::MPTypeTrait<T>::Type;
   std::vector<int64_t> filter_shape_vec(phi::vectorize(filter.dims()));
   std::vector<int64_t> output_shape_vec(phi::vectorize(out->dims()));
@@ -58,8 +64,9 @@ void DeformableConvKernel(const Context& dev_ctx,
   DenseTensor col_buffer = Empty<T>(dev_ctx, col_buffer_shape_vec);
   DenseTensor output_buffer = Empty<T>(dev_ctx, output_buffer_shape_vec);
 
-  int64_t M = output_shape_vec[1] / groups;
-  int64_t N = im2col_step * output_shape_vec[2] * output_shape_vec[3];
+  int64_t M = output_shape_vec[1] / groups;  // 4 : C
+  int64_t N = im2col_step * output_shape_vec[2] *
+              output_shape_vec[3];  // 2*3*3 ：im2Step * H * W
   int64_t K = x.dims()[1] * filter_shape_vec[2] * filter_shape_vec[3] / groups;
 
   DenseTensor weight_3d;
@@ -69,11 +76,18 @@ void DeformableConvKernel(const Context& dev_ctx,
   col_buffer_3d.ShareDataWith(col_buffer)
       .Resize(phi::make_ddim({groups, K, N}));
 
-  DenseTensor output_4d;
+  DenseTensor output_4d;  //计算用 这里需要分配个空内存
   output_4d.ShareDataWith(output_buffer)
-      .Resize(phi::make_ddim({batch_size / im2col_step, groups, M, N}));
-
-  DDim input_shape = phi::slice_ddim(x.dims(), 1, x.dims().size());
+      .Resize(
+          phi::make_ddim({batch_size / im2col_step,
+                          groups,
+                          M,
+                          N}));  // 3 * 1 * 4 * (2*3*3) : mini_batch * group *
+                                 // C/group * (im2stap * H * W) 3 * 1 * 4 *
+                                 // (2*3*3) : mini_batch * C * (im2stap * H * W)
+
+  DDim input_shape =
+      phi::slice_ddim(x.dims(), 1, x.dims().size());  //单张图片大小：C*H*W
   std::vector<int64_t> input_shape_vec = phi::vectorize(input_shape);
 
   int input_dim = x.numel() / x.dims()[0];
@@ -103,17 +117,25 @@ void DeformableConvKernel(const Context& dev_ctx,
         dilations,
         deformable_groups,
         col_buffer_ptr);
-    DenseTensor output_3d = output_4d.Slice(i, i + 1).Resize(
-        phi::slice_ddim(output_4d.dims(), 1, output_4d.dims().size()));
+    DenseTensor output_3d = output_4d.Slice(i, i + 1).Resize(phi::slice_ddim(
+        output_4d.dims(),
+        1,
+        output_4d.dims().size()));  // group * C/group * (im2step * H * W)
+
     // get the product of pixel and weight
     for (int g = 0; g < groups; ++g) {
       DenseTensor weight_3d_slice = weight_3d.Slice(g, g + 1).Resize(
-          phi::slice_ddim(weight_3d.dims(), 1, weight_3d.dims().size()));
+          phi::slice_ddim(weight_3d.dims(),
+                          1,
+                          weight_3d.dims().size()));  //等于是把第0维去掉
       DenseTensor col_buffer_3d_slice =
           col_buffer_3d.Slice(g, g + 1).Resize(phi::slice_ddim(
               col_buffer_3d.dims(), 1, col_buffer_3d.dims().size()));
-      DenseTensor output_3d_slice = output_3d.Slice(g, g + 1).Resize(
-          phi::slice_ddim(output_3d.dims(), 1, output_3d.dims().size()));
+      DenseTensor output_3d_slice =
+          output_3d.Slice(g, g + 1).Resize(phi::slice_ddim(
+              output_3d.dims(),
+              1,
+              output_3d.dims().size()));  // 4*32：C * ((im2col_step)*H*W))
       blas.MatMul(weight_3d_slice,
                   false,
                   col_buffer_3d_slice,
@@ -123,7 +145,29 @@ void DeformableConvKernel(const Context& dev_ctx,
                   T(0.0));
     }
   }
-  out->ShareDataWith(output_buffer).Resize(phi::make_ddim(output_shape_vec));
+
+  // 对于im2col_step大于1时的bug进行修复
+  if (im2col_step > 1) {
+    std::vector<int> axis(4);
+    axis[0] = 0;
+    axis[1] = 2;
+    axis[2] = 1;
+    axis[3] = 3;
+
+    DenseTensor real_output_buffer = phi::Transpose<T, Context>(
+        dev_ctx,
+        output_4d.Resize(
+            phi::make_ddim({batch_size / im2col_step,
+                            output_shape_vec[1],
+                            im2col_step,
+                            output_shape_vec[2] * output_shape_vec[3]})),
+        axis);
+
+    out->ShareDataWith(real_output_buffer)
+        .Resize(phi::make_ddim(output_shape_vec));
+  } else {
+    out->ShareDataWith(output_buffer).Resize(phi::make_ddim(output_shape_vec));
+  }
 }
 
 }  // namespace phi
diff --git a/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py b/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py
index 57afd9f45aa27..ea094d0406d83 100644
--- a/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py
+++ b/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py
@@ -22,7 +22,6 @@
 from op_test import OpTest
 from paddle.fluid.framework import _test_eager_guard
 
-
 paddle.enable_static()
 
 
@@ -204,7 +203,6 @@ def init_type(self):
         self.dtype = np.float32
 
 
-
 class TestWithStride(TestModulatedDeformableConvOp):
 
     def init_test_case(self):
@@ -302,7 +300,6 @@ def test_check_output(self):
 class TestModulatedDeformableConvV1InvalidInput(unittest.TestCase):
 
     def test_error(self):
-
         def test_invalid_input():
             input = [1, 3, 32, 32]
             offset = fluid.data(name='offset',

From 9a28eff856abe1a94428bb23ac07aebd6cb35c80 Mon Sep 17 00:00:00 2001
From: ray96 <lqr5259@outlook.com>
Date: Mon, 19 Sep 2022 11:12:18 +0800
Subject: [PATCH 06/23] modify file

---
 paddle/phi/kernels/gpu/deformable_conv_kernel.cu | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/paddle/phi/kernels/gpu/deformable_conv_kernel.cu b/paddle/phi/kernels/gpu/deformable_conv_kernel.cu
index 31348fbb2c28d..52972a951c0a8 100644
--- a/paddle/phi/kernels/gpu/deformable_conv_kernel.cu
+++ b/paddle/phi/kernels/gpu/deformable_conv_kernel.cu
@@ -25,8 +25,4 @@ PD_REGISTER_KERNEL(deformable_conv,
                    float,
                    double,
                    phi::dtype::float16) {
-  if (kernel_key.dtype() == phi::DataType::FLOAT16) {
-    kernel->InputAt(3).SetDataType(phi::DataType::FLOAT32);
-    kernel->InputAt(4).SetDataType(phi::DataType::FLOAT32);
-  }
 }

From c56849fddcae5bcde6c2262272e3648d75c8d601 Mon Sep 17 00:00:00 2001
From: ray96 <lqr5259@outlook.com>
Date: Mon, 19 Sep 2022 11:50:22 +0800
Subject: [PATCH 07/23] modify file

---
 paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu b/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu
index a87da6102f44a..1d30cb73607a9 100644
--- a/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu
@@ -109,9 +109,8 @@ __global__ void ModulatedDeformableCol2imGpuKernel(
                                             width);
 
           //          慢在这 ，不支持float16
-          *(grad_im + cur_bottom_grad_pos) =
-              *(grad_im + cur_bottom_grad_pos) +
-              static_cast<T>(weight * cur_top_grad);
+          paddle::platform::CudaAtomicAdd(grad_im + cur_bottom_grad_pos,
+                                          static_cast<T>(weight * cur_top_grad));
         }
       }
     }

From 7683477b0277e8d3b01d3c261aab7548bd5ccf72 Mon Sep 17 00:00:00 2001
From: ray96 <lqr5259@outlook.com>
Date: Tue, 20 Sep 2022 15:53:26 +0800
Subject: [PATCH 08/23] use fastAtomicAdd

---
 paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu b/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu
index 1d30cb73607a9..27aec188e38fa 100644
--- a/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu
@@ -107,10 +107,11 @@ __global__ void ModulatedDeformableCol2imGpuKernel(
                                             cur_w + dx,
                                             height,
                                             width);
-
-          //          慢在这 ，不支持float16
-          paddle::platform::CudaAtomicAdd(grad_im + cur_bottom_grad_pos,
-                                          static_cast<T>(weight * cur_top_grad));
+          paddle::platform::fastAtomicAdd<T>(
+              grad_im,
+              cur_bottom_grad_pos,
+              nthreads,
+              static_cast<T>(weight * cur_top_grad));
         }
       }
     }
@@ -365,4 +366,4 @@ PD_REGISTER_KERNEL(deformable_conv_grad,
                    phi::DeformableConvGradKernel,
                    float,
                    double,
-                   phi::dtype::float16) {}
+                   paddle::platform::float16) {}

From ec5644ff58bd01dc2a51ded96f805a16b6496625 Mon Sep 17 00:00:00 2001
From: ray96 <lqr5259@outlook.com>
Date: Wed, 21 Sep 2022 12:17:23 +0800
Subject: [PATCH 09/23] acc for ModulatedDeformableCol2im

---
 .../gpu/deformable_conv_grad_kernel.cu        | 34 ++++++-------
 .../impl/deformable_conv_grad_kernel_impl.h   | 48 ++++++++-----------
 2 files changed, 36 insertions(+), 46 deletions(-)

diff --git a/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu b/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu
index 27aec188e38fa..d0c820460b202 100644
--- a/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu
@@ -29,7 +29,7 @@ static inline int NumBlocks(const int N) {
                   kNumMaximumNumBlocks);
 }
 
-template <typename T>
+template <typename T, typename MT>
 __global__ void ModulatedDeformableCol2imGpuKernel(
     const int nthreads,
     const T* data_col,
@@ -51,10 +51,10 @@ __global__ void ModulatedDeformableCol2imGpuKernel(
     const int deformable_group,
     const int height_col,
     const int width_col,
-    T* grad_im) {
+    MT* grad_im) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   int offset = blockDim.x * gridDim.x;
-  using MT = typename phi::dtype::MPTypeTrait<T>::Type;
+  // using MT = typename phi::dtype::MPTypeTrait<T>::Type;
   for (size_t thread = index; thread < nthreads; thread += offset) {
     const int j = (thread / width_col / height_col / batch_size) % kernel_w;
     const int i =
@@ -107,18 +107,15 @@ __global__ void ModulatedDeformableCol2imGpuKernel(
                                             cur_w + dx,
                                             height,
                                             width);
-          paddle::platform::fastAtomicAdd<T>(
-              grad_im,
-              cur_bottom_grad_pos,
-              nthreads,
-              static_cast<T>(weight * cur_top_grad));
+          paddle::platform::CudaAtomicAdd(grad_im + cur_bottom_grad_pos,
+                                          weight * cur_top_grad);
         }
       }
     }
   }
 }
 
-template <typename T, typename Context>
+template <typename T, typename MT, typename Context>
 void ModulatedDeformableCol2im(const Context& dev_ctx,
                                const T* data_col,
                                const T* data_offset,
@@ -130,13 +127,13 @@ void ModulatedDeformableCol2im(const Context& dev_ctx,
                                const std::vector<int>& stride,
                                const std::vector<int>& dilation,
                                const int deformable_group,
-                               T* grad_im) {
+                               MT* grad_im) {
   int channel_per_deformable_group = im_shape[0] / deformable_group;
   int num_kernels = col_shape[0] * col_shape[1] * col_shape[2] * col_shape[3];
   int blocks = NumBlocks(num_kernels);
   int threads = kNumCUDAThreads;
 
-  ModulatedDeformableCol2imGpuKernel<T>
+  ModulatedDeformableCol2imGpuKernel<T, MT>
       <<<blocks, threads, 0, dev_ctx.stream()>>>(num_kernels,
                                                  data_col,
                                                  data_offset,
@@ -243,14 +240,13 @@ __global__ void ModulatedDeformableCol2imCoordGpuKernel(
       if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) {
         inv_h = inv_w = -2;
       } else {
-        mval +=
-            static_cast<MT>(data_col_ptr[col_pos]) *
-            funcs::DmcnIm2colBilinear<T, MT>(data_im_ptr + cnt * height * width,
-                                             width,
-                                             height,
-                                             width,
-                                             inv_h,
-                                             inv_w);
+        mval += static_cast<MT>(data_col_ptr[col_pos]) *
+                funcs::DmcnIm2colBilinear<T, MT>(data_im_ptr + cnt * height * width,
+                                                 width,
+                                                 height,
+                                                 width,
+                                                 inv_h,
+                                                 inv_w);
       }
       const MT weight =
           DmcnGetCoordinateWeight<T, MT>(inv_h,
diff --git a/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h b/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h
index 90a5243fc72d4..ccbe3aa21b4e7 100644
--- a/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h
@@ -14,13 +14,13 @@
 
 #pragma once
 
-#include "paddle/phi/common/amp_type_traits.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/hostdevice.h"
 #include "paddle/phi/kernels/empty_kernel.h"
 #include "paddle/phi/kernels/full_kernel.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/deformable_conv_functor.h"
+#include "paddle/phi/common/amp_type_traits.h"
 
 namespace phi {
 
@@ -82,46 +82,38 @@ HOSTDEVICE MT DmcnGetCoordinateWeight(MT argmax_h,
   if (bp_dir == 0) {
     weight += (argmax_h_low >= 0 && argmax_w_low >= 0)
                   ? -1 * (argmax_w_low + 1 - argmax_w) *
-                        static_cast<MT>(
-                            im_data[argmax_h_low * data_width + argmax_w_low])
+                        static_cast<MT>(im_data[argmax_h_low * data_width + argmax_w_low])
                   : 0;
 
     weight += (argmax_h_low >= 0 && argmax_w_high <= width - 1)
                   ? -1 * (argmax_w - argmax_w_low) *
-                        static_cast<MT>(
-                            im_data[argmax_h_low * data_width + argmax_w_high])
+                        static_cast<MT>(im_data[argmax_h_low * data_width + argmax_w_high])
                   : 0;
 
     weight += (argmax_h_high <= height - 1 && argmax_w_low >= 0)
                   ? (argmax_w_low + 1 - argmax_w) *
-                        static_cast<MT>(
-                            im_data[argmax_h_high * data_width + argmax_w_low])
+                        static_cast<MT>(im_data[argmax_h_high * data_width + argmax_w_low])
                   : 0;
     weight += (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
                   ? (argmax_w - argmax_w_low) *
-                        static_cast<MT>(
-                            im_data[argmax_h_high * data_width + argmax_w_high])
+                        static_cast<MT>(im_data[argmax_h_high * data_width + argmax_w_high])
                   : 0;
   } else if (bp_dir == 1) {
     weight += (argmax_h_low >= 0 && argmax_w_low >= 0)
                   ? -1 * (argmax_h_low + 1 - argmax_h) *
-                        static_cast<MT>(
-                            im_data[argmax_h_low * data_width + argmax_w_low])
+                        static_cast<MT>(im_data[argmax_h_low * data_width + argmax_w_low])
                   : 0;
     weight += (argmax_h_low >= 0 && argmax_w_high <= width - 1)
                   ? (argmax_h_low + 1 - argmax_h) *
-                        static_cast<MT>(
-                            im_data[argmax_h_low * data_width + argmax_w_high])
+                        static_cast<MT>(im_data[argmax_h_low * data_width + argmax_w_high])
                   : 0;
     weight += (argmax_h_high <= height - 1 && argmax_w_low >= 0)
                   ? -1 * (argmax_h - argmax_h_low) *
-                        static_cast<MT>(
-                            im_data[argmax_h_high * data_width + argmax_w_low])
+                        static_cast<MT>(im_data[argmax_h_high * data_width + argmax_w_low])
                   : 0;
     weight += (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
                   ? (argmax_h - argmax_h_low) *
-                        static_cast<MT>(
-                            im_data[argmax_h_high * data_width + argmax_w_high])
+                        static_cast<MT>(im_data[argmax_h_high * data_width + argmax_w_high])
                   : 0;
   }
 
@@ -144,7 +136,7 @@ void ModulatedDeformableCol2imCoord(const Context& dev_ctx,
                                     T* grad_offset,
                                     T* grad_mask);
 
-template <typename T, typename Context>
+template <typename T, typename MT, typename Context>
 void ModulatedDeformableCol2im(const Context& dev_ctx,
                                const T* data_col,
                                const T* data_offset,
@@ -156,7 +148,7 @@ void ModulatedDeformableCol2im(const Context& dev_ctx,
                                const std::vector<int>& stride,
                                const std::vector<int>& dilation,
                                const int deformable_group,
-                               T* grad_im);
+                               MT* grad_im);
 
 template <typename T, typename Context>
 void FilterGradAddup(const Context& dev_ctx,
@@ -167,6 +159,7 @@ void FilterGradAddup(const Context& dev_ctx,
                      const T* dweight_3d,
                      T* filter_grad);
 
+// 总入口
 template <typename T, typename Context>
 void DeformableConvGradKernel(const Context& dev_ctx,
                               const DenseTensor& x,
@@ -185,7 +178,7 @@ void DeformableConvGradKernel(const Context& dev_ctx,
                               DenseTensor* filter_grad,
                               DenseTensor* mask_grad) {
   const int batch_size = static_cast<int>(x.dims()[0]);
-
+  using MT = typename phi::dtype::MPTypeTrait<T>::Type;
   DDim input_shape = phi::slice_ddim(x.dims(), 1, x.dims().size());
   std::vector<int64_t> input_shape_vec = phi::vectorize(input_shape);
   std::vector<int64_t> filter_shape_vec(phi::vectorize(filter.dims()));
@@ -270,7 +263,7 @@ void DeformableConvGradKernel(const Context& dev_ctx,
                   T(1.0),
                   &col_buffer_3d_slice,
                   T(0.0));
-    }
+    } // 相信没啥问题
     col_buffer.Resize(make_ddim(col_buffer_shape_vec));
 
     T* col_buffer_ptr = col_buffer.data<T>();
@@ -299,10 +292,11 @@ void DeformableConvGradKernel(const Context& dev_ctx,
           deformable_groups,
           offset_grad_ptr + i * im2col_step * input_offset_dim,
           mask_grad_data_ptr);
-    }  // check
+    } //check
     if (dx) {
-      T* dx_ptr = dx->data<T>();
-      // get grad of input
+
+      MT* mt_dx_ptr = dev_ctx.template Alloc<MT>(dx);
+
       ModulatedDeformableCol2im(dev_ctx,
                                 col_buffer_ptr,
                                 offset_ptr + i * im2col_step * input_offset_dim,
@@ -314,7 +308,7 @@ void DeformableConvGradKernel(const Context& dev_ctx,
                                 strides,
                                 dilations,
                                 deformable_groups,
-                                dx_ptr + i * im2col_step * input_dim);  //待改
+                                mt_dx_ptr + i * im2col_step * input_dim);
       dx->Resize(x.dims());
     }
 
@@ -330,7 +324,7 @@ void DeformableConvGradKernel(const Context& dev_ctx,
         strides,
         dilations,
         deformable_groups,
-        col_buffer_ptr);  // check
+        col_buffer_ptr); //check
 
     col_buffer_3d.Resize(col_buffer_3d_shape);
 
@@ -362,7 +356,7 @@ void DeformableConvGradKernel(const Context& dev_ctx,
                          K,
                          M,
                          dweight_3d.data<T>(),
-                         filter_grad->data<T>());  // 待改
+                         filter_grad->data<T>());
     }
   }
   if (filter_grad) {

From 4538bb5d927ff93e6790de3b302bccea50b15376 Mon Sep 17 00:00:00 2001
From: ray96 <lqr5259@outlook.com>
Date: Thu, 22 Sep 2022 20:43:42 +0800
Subject: [PATCH 10/23] modify codestyle

---
 .../cpu/deformable_conv_grad_kernel.cc        |  4 +--
 .../gpu/deformable_conv_grad_kernel.cu        | 15 +++++----
 .../impl/deformable_conv_grad_kernel_impl.h   | 33 +++++++++++--------
 .../unittests/test_deformable_conv_v1_op.py   |  1 +
 4 files changed, 31 insertions(+), 22 deletions(-)

diff --git a/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc b/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc
index 962576154a7bf..537f0de367693 100644
--- a/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc
@@ -106,7 +106,7 @@ inline void ModulatedDeformableCol2imCPUKernel(
   }
 }
 
-template <typename T, typename Context>
+template <typename T, typename MT, typename Context>
 void ModulatedDeformableCol2im(const Context& dev_ctx,
                                const T* data_col,
                                const T* data_offset,
@@ -118,7 +118,7 @@ void ModulatedDeformableCol2im(const Context& dev_ctx,
                                const std::vector<int>& stride,
                                const std::vector<int>& dilation,
                                const int deformable_group,
-                               T* grad_im) {
+                               MT* grad_im) {
   int channel_per_deformable_group = im_shape[0] / deformable_group;
   int num_kernels = col_shape[0] * col_shape[1] * col_shape[2] * col_shape[3];
 
diff --git a/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu b/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu
index d0c820460b202..5d2f4727c53e2 100644
--- a/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu
@@ -240,13 +240,14 @@ __global__ void ModulatedDeformableCol2imCoordGpuKernel(
       if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) {
         inv_h = inv_w = -2;
       } else {
-        mval += static_cast<MT>(data_col_ptr[col_pos]) *
-                funcs::DmcnIm2colBilinear<T, MT>(data_im_ptr + cnt * height * width,
-                                                 width,
-                                                 height,
-                                                 width,
-                                                 inv_h,
-                                                 inv_w);
+        mval +=
+            static_cast<MT>(data_col_ptr[col_pos]) *
+            funcs::DmcnIm2colBilinear<T, MT>(data_im_ptr + cnt * height * width,
+                                             width,
+                                             height,
+                                             width,
+                                             inv_h,
+                                             inv_w);
       }
       const MT weight =
           DmcnGetCoordinateWeight<T, MT>(inv_h,
diff --git a/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h b/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h
index ccbe3aa21b4e7..af620de132cfa 100644
--- a/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h
@@ -14,13 +14,13 @@
 
 #pragma once
 
+#include "paddle/phi/common/amp_type_traits.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/hostdevice.h"
 #include "paddle/phi/kernels/empty_kernel.h"
 #include "paddle/phi/kernels/full_kernel.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/deformable_conv_functor.h"
-#include "paddle/phi/common/amp_type_traits.h"
 
 namespace phi {
 
@@ -82,38 +82,46 @@ HOSTDEVICE MT DmcnGetCoordinateWeight(MT argmax_h,
   if (bp_dir == 0) {
     weight += (argmax_h_low >= 0 && argmax_w_low >= 0)
                   ? -1 * (argmax_w_low + 1 - argmax_w) *
-                        static_cast<MT>(im_data[argmax_h_low * data_width + argmax_w_low])
+                        static_cast<MT>(
+                            im_data[argmax_h_low * data_width + argmax_w_low])
                   : 0;
 
     weight += (argmax_h_low >= 0 && argmax_w_high <= width - 1)
                   ? -1 * (argmax_w - argmax_w_low) *
-                        static_cast<MT>(im_data[argmax_h_low * data_width + argmax_w_high])
+                        static_cast<MT>(
+                            im_data[argmax_h_low * data_width + argmax_w_high])
                   : 0;
 
     weight += (argmax_h_high <= height - 1 && argmax_w_low >= 0)
                   ? (argmax_w_low + 1 - argmax_w) *
-                        static_cast<MT>(im_data[argmax_h_high * data_width + argmax_w_low])
+                        static_cast<MT>(
+                            im_data[argmax_h_high * data_width + argmax_w_low])
                   : 0;
     weight += (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
                   ? (argmax_w - argmax_w_low) *
-                        static_cast<MT>(im_data[argmax_h_high * data_width + argmax_w_high])
+                        static_cast<MT>(
+                            im_data[argmax_h_high * data_width + argmax_w_high])
                   : 0;
   } else if (bp_dir == 1) {
     weight += (argmax_h_low >= 0 && argmax_w_low >= 0)
                   ? -1 * (argmax_h_low + 1 - argmax_h) *
-                        static_cast<MT>(im_data[argmax_h_low * data_width + argmax_w_low])
+                        static_cast<MT>(
+                            im_data[argmax_h_low * data_width + argmax_w_low])
                   : 0;
     weight += (argmax_h_low >= 0 && argmax_w_high <= width - 1)
                   ? (argmax_h_low + 1 - argmax_h) *
-                        static_cast<MT>(im_data[argmax_h_low * data_width + argmax_w_high])
+                        static_cast<MT>(
+                            im_data[argmax_h_low * data_width + argmax_w_high])
                   : 0;
     weight += (argmax_h_high <= height - 1 && argmax_w_low >= 0)
                   ? -1 * (argmax_h - argmax_h_low) *
-                        static_cast<MT>(im_data[argmax_h_high * data_width + argmax_w_low])
+                        static_cast<MT>(
+                            im_data[argmax_h_high * data_width + argmax_w_low])
                   : 0;
     weight += (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
                   ? (argmax_h - argmax_h_low) *
-                        static_cast<MT>(im_data[argmax_h_high * data_width + argmax_w_high])
+                        static_cast<MT>(
+                            im_data[argmax_h_high * data_width + argmax_w_high])
                   : 0;
   }
 
@@ -263,7 +271,7 @@ void DeformableConvGradKernel(const Context& dev_ctx,
                   T(1.0),
                   &col_buffer_3d_slice,
                   T(0.0));
-    } // 相信没啥问题
+    }  // 相信没啥问题
     col_buffer.Resize(make_ddim(col_buffer_shape_vec));
 
     T* col_buffer_ptr = col_buffer.data<T>();
@@ -292,9 +300,8 @@ void DeformableConvGradKernel(const Context& dev_ctx,
           deformable_groups,
           offset_grad_ptr + i * im2col_step * input_offset_dim,
           mask_grad_data_ptr);
-    } //check
+    }  // check
     if (dx) {
-
       MT* mt_dx_ptr = dev_ctx.template Alloc<MT>(dx);
 
       ModulatedDeformableCol2im(dev_ctx,
@@ -324,7 +331,7 @@ void DeformableConvGradKernel(const Context& dev_ctx,
         strides,
         dilations,
         deformable_groups,
-        col_buffer_ptr); //check
+        col_buffer_ptr);  // check
 
     col_buffer_3d.Resize(col_buffer_3d_shape);
 
diff --git a/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py b/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py
index ea094d0406d83..6e70a8487e6a6 100644
--- a/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py
+++ b/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py
@@ -300,6 +300,7 @@ def test_check_output(self):
 class TestModulatedDeformableConvV1InvalidInput(unittest.TestCase):
 
     def test_error(self):
+
         def test_invalid_input():
             input = [1, 3, 32, 32]
             offset = fluid.data(name='offset',

From e0c75aff4beda947563dbbf45d20db77acb026de Mon Sep 17 00:00:00 2001
From: ray96 <lqr5259@outlook.com>
Date: Mon, 26 Sep 2022 16:36:37 +0800
Subject: [PATCH 11/23] remove useless notes

---
 .../kernels/funcs/deformable_conv_functor.cu  |  2 --
 .../impl/deformable_conv_grad_kernel_impl.h   |  7 ++---
 .../impl/deformable_conv_kernel_impl.h        | 28 ++++++-------------
 3 files changed, 12 insertions(+), 25 deletions(-)

diff --git a/paddle/phi/kernels/funcs/deformable_conv_functor.cu b/paddle/phi/kernels/funcs/deformable_conv_functor.cu
index 7ae2526406f2b..8a23c0d26d424 100644
--- a/paddle/phi/kernels/funcs/deformable_conv_functor.cu
+++ b/paddle/phi/kernels/funcs/deformable_conv_functor.cu
@@ -82,8 +82,6 @@ __global__ void ModulatedDeformableIm2colGpuKernel(
                               kernel_h * kernel_w * height_col * width_col
             : nullptr;
 
-    // 上边都动不了，指针移动也不会引起误差和速度降低
-
     for (int i = 0; i < kernel_h; ++i) {
       for (int j = 0; j < kernel_w; ++j) {
         const int data_offset_h_ptr =
diff --git a/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h b/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h
index af620de132cfa..4828a02cfefa0 100644
--- a/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h
@@ -167,7 +167,6 @@ void FilterGradAddup(const Context& dev_ctx,
                      const T* dweight_3d,
                      T* filter_grad);
 
-// 总入口
 template <typename T, typename Context>
 void DeformableConvGradKernel(const Context& dev_ctx,
                               const DenseTensor& x,
@@ -271,7 +270,7 @@ void DeformableConvGradKernel(const Context& dev_ctx,
                   T(1.0),
                   &col_buffer_3d_slice,
                   T(0.0));
-    }  // 相信没啥问题
+    }
     col_buffer.Resize(make_ddim(col_buffer_shape_vec));
 
     T* col_buffer_ptr = col_buffer.data<T>();
@@ -300,7 +299,7 @@ void DeformableConvGradKernel(const Context& dev_ctx,
           deformable_groups,
           offset_grad_ptr + i * im2col_step * input_offset_dim,
           mask_grad_data_ptr);
-    }  // check
+    }
     if (dx) {
       MT* mt_dx_ptr = dev_ctx.template Alloc<MT>(dx);
 
@@ -331,7 +330,7 @@ void DeformableConvGradKernel(const Context& dev_ctx,
         strides,
         dilations,
         deformable_groups,
-        col_buffer_ptr);  // check
+        col_buffer_ptr);
 
     col_buffer_3d.Resize(col_buffer_3d_shape);
 
diff --git a/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h b/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h
index 450d7a48e2bfb..d66f4e58e5b61 100644
--- a/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h
+++ b/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h
@@ -64,9 +64,8 @@ void DeformableConvKernel(const Context& dev_ctx,
   DenseTensor col_buffer = Empty<T>(dev_ctx, col_buffer_shape_vec);
   DenseTensor output_buffer = Empty<T>(dev_ctx, output_buffer_shape_vec);
 
-  int64_t M = output_shape_vec[1] / groups;  // 4 : C
-  int64_t N = im2col_step * output_shape_vec[2] *
-              output_shape_vec[3];  // 2*3*3 ：im2Step * H * W
+  int64_t M = output_shape_vec[1] / groups;
+  int64_t N = im2col_step * output_shape_vec[2] * output_shape_vec[3];
   int64_t K = x.dims()[1] * filter_shape_vec[2] * filter_shape_vec[3] / groups;
 
   DenseTensor weight_3d;
@@ -76,18 +75,11 @@ void DeformableConvKernel(const Context& dev_ctx,
   col_buffer_3d.ShareDataWith(col_buffer)
       .Resize(phi::make_ddim({groups, K, N}));
 
-  DenseTensor output_4d;  //计算用 这里需要分配个空内存
+  DenseTensor output_4d;
   output_4d.ShareDataWith(output_buffer)
-      .Resize(
-          phi::make_ddim({batch_size / im2col_step,
-                          groups,
-                          M,
-                          N}));  // 3 * 1 * 4 * (2*3*3) : mini_batch * group *
-                                 // C/group * (im2stap * H * W) 3 * 1 * 4 *
-                                 // (2*3*3) : mini_batch * C * (im2stap * H * W)
-
-  DDim input_shape =
-      phi::slice_ddim(x.dims(), 1, x.dims().size());  //单张图片大小：C*H*W
+      .Resize(phi::make_ddim({batch_size / im2col_step, groups, M, N}));
+
+  DDim input_shape = phi::slice_ddim(x.dims(), 1, x.dims().size());
   std::vector<int64_t> input_shape_vec = phi::vectorize(input_shape);
 
   int input_dim = x.numel() / x.dims()[0];
@@ -125,9 +117,7 @@ void DeformableConvKernel(const Context& dev_ctx,
     // get the product of pixel and weight
     for (int g = 0; g < groups; ++g) {
       DenseTensor weight_3d_slice = weight_3d.Slice(g, g + 1).Resize(
-          phi::slice_ddim(weight_3d.dims(),
-                          1,
-                          weight_3d.dims().size()));  //等于是把第0维去掉
+          phi::slice_ddim(weight_3d.dims(), 1, weight_3d.dims().size()));
       DenseTensor col_buffer_3d_slice =
           col_buffer_3d.Slice(g, g + 1).Resize(phi::slice_ddim(
               col_buffer_3d.dims(), 1, col_buffer_3d.dims().size()));
@@ -135,7 +125,7 @@ void DeformableConvKernel(const Context& dev_ctx,
           output_3d.Slice(g, g + 1).Resize(phi::slice_ddim(
               output_3d.dims(),
               1,
-              output_3d.dims().size()));  // 4*32：C * ((im2col_step)*H*W))
+              output_3d.dims().size()));  // C * ((im2col_step)*H*W))
       blas.MatMul(weight_3d_slice,
                   false,
                   col_buffer_3d_slice,
@@ -146,7 +136,7 @@ void DeformableConvKernel(const Context& dev_ctx,
     }
   }
 
-  // 对于im2col_step大于1时的bug进行修复
+  //  swap axis to get the right result when im2col_step is greater than 1
   if (im2col_step > 1) {
     std::vector<int> axis(4);
     axis[0] = 0;

From ab8aeec150e440a45f9bbbe105a46a3c53017fc4 Mon Sep 17 00:00:00 2001
From: Rayman <lqr5259@outlook.com>
Date: Thu, 29 Sep 2022 11:40:28 +0800
Subject: [PATCH 12/23] Update deformable_conv_kernel.cu

---
 paddle/phi/kernels/gpu/deformable_conv_kernel.cu | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/paddle/phi/kernels/gpu/deformable_conv_kernel.cu b/paddle/phi/kernels/gpu/deformable_conv_kernel.cu
index 52972a951c0a8..021791ca93061 100644
--- a/paddle/phi/kernels/gpu/deformable_conv_kernel.cu
+++ b/paddle/phi/kernels/gpu/deformable_conv_kernel.cu
@@ -24,5 +24,4 @@ PD_REGISTER_KERNEL(deformable_conv,
                    phi::DeformableConvKernel,
                    float,
                    double,
-                   phi::dtype::float16) {
-}
+                   phi::dtype::float16) {}

From b697128fd55b573fb992b887736912497f287f7e Mon Sep 17 00:00:00 2001
From: ray96 <lqr5259@outlook.com>
Date: Sat, 8 Oct 2022 09:52:14 +0800
Subject: [PATCH 13/23] modify unit_test

---
 .../fluid/tests/unittests/test_deformable_conv_v1_op.py   | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py b/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py
index 58147d9e038b5..6ce1aaaa5d79c 100644
--- a/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py
+++ b/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py
@@ -294,11 +294,17 @@ def init_type(self):
     def test_check_output(self):
         self.check_output(check_eager=True, atol=1e-3)
 
+    def test_check_grad(self):
+        self.check_grad_with_place(core.CUDAPlace(0), ['Input', 'Offset', 'Filter'],
+                                   'Output',
+                                   numeric_grad_delta=1e-3,
+                                   max_relative_error=1e-3,
+                                   check_eager=True)
+
 
 class TestModulatedDeformableConvV1InvalidInput(unittest.TestCase):
 
     def test_error(self):
-
         def test_invalid_input():
             input = [1, 3, 32, 32]
             offset = fluid.data(name='offset',

From 242d9a6b9bddea874a512c54a4936c397ae1983e Mon Sep 17 00:00:00 2001
From: ray96 <lqr5259@outlook.com>
Date: Sat, 8 Oct 2022 21:46:26 +0800
Subject: [PATCH 14/23] modify test

---
 .../fluid/tests/unittests/test_deformable_conv_v1_op.py  | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py b/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py
index 6ce1aaaa5d79c..d729c4bdf3d9c 100644
--- a/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py
+++ b/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py
@@ -286,6 +286,8 @@ def init_type(self):
         self.dtype = np.float64
 
 
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
 class TestWithFloat16(TestModulatedDeformableConvOp):
 
     def init_type(self):
@@ -294,13 +296,6 @@ def init_type(self):
     def test_check_output(self):
         self.check_output(check_eager=True, atol=1e-3)
 
-    def test_check_grad(self):
-        self.check_grad_with_place(core.CUDAPlace(0), ['Input', 'Offset', 'Filter'],
-                                   'Output',
-                                   numeric_grad_delta=1e-3,
-                                   max_relative_error=1e-3,
-                                   check_eager=True)
-
 
 class TestModulatedDeformableConvV1InvalidInput(unittest.TestCase):
 

From daf5b2a2e921a5418b83fd54b8c0492fdf68b0f2 Mon Sep 17 00:00:00 2001
From: ray96 <lqr5259@outlook.com>
Date: Sun, 9 Oct 2022 09:06:34 +0800
Subject: [PATCH 15/23] modify test

---
 .../unittests/test_deformable_conv_v1_op.py      | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py b/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py
index d729c4bdf3d9c..54705e108ed41 100644
--- a/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py
+++ b/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py
@@ -296,10 +296,26 @@ def init_type(self):
     def test_check_output(self):
         self.check_output(check_eager=True, atol=1e-3)
 
+    def test_check_grad(self):
+        self.check_grad(['Input', 'Filter'],
+                        'Output',
+                        numeric_grad_delta=3e-1,
+                        max_relative_error=4e-1,
+                        check_eager=True)
+
+    def test_check_grad_no_filter(self):
+        self.check_grad(['Input'],
+                        'Output',
+                        numeric_grad_delta=3e-1,
+                        max_relative_error=2e-1,
+                        no_grad_set=set(['Filter']),
+                        check_eager=True)
+
 
 class TestModulatedDeformableConvV1InvalidInput(unittest.TestCase):
 
     def test_error(self):
+
         def test_invalid_input():
             input = [1, 3, 32, 32]
             offset = fluid.data(name='offset',

From f1b463afa9ec16a911b4956202866d38293fb07f Mon Sep 17 00:00:00 2001
From: ray96 <lqr5259@outlook.com>
Date: Sun, 9 Oct 2022 11:53:35 +0800
Subject: [PATCH 16/23] modify when type16

---
 .../kernels/funcs/deformable_conv_functor.cc  |  1 -
 .../kernels/funcs/deformable_conv_functor.cu  |  2 +-
 .../impl/deformable_conv_grad_kernel_impl.h   | 51 ++++++++++++++-----
 3 files changed, 38 insertions(+), 16 deletions(-)

diff --git a/paddle/phi/kernels/funcs/deformable_conv_functor.cc b/paddle/phi/kernels/funcs/deformable_conv_functor.cc
index f6483b9cdd2e8..d5b99760f9a45 100644
--- a/paddle/phi/kernels/funcs/deformable_conv_functor.cc
+++ b/paddle/phi/kernels/funcs/deformable_conv_functor.cc
@@ -13,7 +13,6 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/funcs/deformable_conv_functor.h"
-#include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/float16.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/common/amp_type_traits.h"
diff --git a/paddle/phi/kernels/funcs/deformable_conv_functor.cu b/paddle/phi/kernels/funcs/deformable_conv_functor.cu
index 8a23c0d26d424..813d549136084 100644
--- a/paddle/phi/kernels/funcs/deformable_conv_functor.cu
+++ b/paddle/phi/kernels/funcs/deformable_conv_functor.cu
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/funcs/deformable_conv_functor.h"
-#include "paddle/fluid/platform/device_context.h"
+#include "paddle/phi/core/device_context.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/amp_type_traits.h"
 #include "paddle/phi/common/float16.h"
diff --git a/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h b/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h
index 4828a02cfefa0..ab6d226b03c56 100644
--- a/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h
@@ -21,6 +21,7 @@
 #include "paddle/phi/kernels/full_kernel.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/deformable_conv_functor.h"
+#include "paddle/phi/kernels/cast_kernel.h"
 
 namespace phi {
 
@@ -301,21 +302,43 @@ void DeformableConvGradKernel(const Context& dev_ctx,
           mask_grad_data_ptr);
     }
     if (dx) {
-      MT* mt_dx_ptr = dev_ctx.template Alloc<MT>(dx);
-
-      ModulatedDeformableCol2im(dev_ctx,
-                                col_buffer_ptr,
-                                offset_ptr + i * im2col_step * input_offset_dim,
-                                mask_data_ptr,
-                                input_shape_vec,
-                                col_buffer_shape_vec,
-                                filter_shape_vec,
-                                paddings,
-                                strides,
-                                dilations,
-                                deformable_groups,
-                                mt_dx_ptr + i * im2col_step * input_dim);
+      if (x.dtype() == DataType::FLOAT16){
+        DenseTensor mt_dx = phi::EmptyLike<MT, Context>(dev_ctx, *dx);
+        MT* mt_dx_ptr = dev_ctx.template Alloc<MT>(&mt_dx);
+
+        ModulatedDeformableCol2im(dev_ctx,
+                                  col_buffer_ptr,
+                                  offset_ptr + i * im2col_step * input_offset_dim,
+                                  mask_data_ptr,
+                                  input_shape_vec,
+                                  col_buffer_shape_vec,
+                                  filter_shape_vec,
+                                  paddings,
+                                  strides,
+                                  dilations,
+                                  deformable_groups,
+                                  mt_dx_ptr + i * im2col_step * input_dim);
+
+        DenseTensor t_dx = phi::Cast<MT, Context>(dev_ctx, mt_dx, x.dtype());
+        dx->ShareDataWith(t_dx);
+      } else {
+        MT* mt_dx_ptr = dev_ctx.template Alloc<MT>(dx);
+        ModulatedDeformableCol2im(dev_ctx,
+                                  col_buffer_ptr,
+                                  offset_ptr + i * im2col_step * input_offset_dim,
+                                  mask_data_ptr,
+                                  input_shape_vec,
+                                  col_buffer_shape_vec,
+                                  filter_shape_vec,
+                                  paddings,
+                                  strides,
+                                  dilations,
+                                  deformable_groups,
+                                  mt_dx_ptr + i * im2col_step * input_dim);
+
+      }
       dx->Resize(x.dims());
+
     }
 
     funcs::ModulatedDeformableIm2col(

From 916abf16d57fe74a812ec8618f6b8ae0d7c178f2 Mon Sep 17 00:00:00 2001
From: ray96 <lqr5259@outlook.com>
Date: Sun, 9 Oct 2022 13:19:01 +0800
Subject: [PATCH 17/23] skip grad check for fp16

---
 .../cpu/deformable_conv_grad_kernel.cc        | 76 +++++++++----------
 .../kernels/funcs/deformable_conv_functor.cc  | 16 ++--
 .../unittests/test_deformable_conv_v1_op.py   | 18 ++---
 3 files changed, 48 insertions(+), 62 deletions(-)

diff --git a/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc b/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc
index 537f0de367693..d38310360d0ae 100644
--- a/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc
@@ -43,7 +43,6 @@ inline void ModulatedDeformableCol2imCPUKernel(
     const int height_col,
     const int width_col,
     T* grad_im) {
-  using MT = typename phi::dtype::MPTypeTrait<T>::Type;
   for (int thread = 0; thread < num_kernels; thread++) {
     const int j = (thread / width_col / height_col / batch_size) % kernel_w;
     const int i =
@@ -68,17 +67,17 @@ inline void ModulatedDeformableCol2imCPUKernel(
         ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
     const int data_mask_hw_ptr =
         ((i * kernel_w + j) * height_col + h_out) * width_col + w_out;
-    const MT offset_h = static_cast<MT>(data_offset_ptr[data_offset_h_ptr]);
-    const MT offset_w = static_cast<MT>(data_offset_ptr[data_offset_w_ptr]);
-    const MT cur_inv_h_data = h_in + i * dilation_h + offset_h;
-    const MT cur_inv_w_data = w_in + j * dilation_w + offset_w;
+    const T offset_h = data_offset_ptr[data_offset_h_ptr];
+    const T offset_w = data_offset_ptr[data_offset_w_ptr];
+    const T cur_inv_h_data = h_in + i * dilation_h + offset_h;
+    const T cur_inv_w_data = w_in + j * dilation_w + offset_w;
 
-    MT cur_top_grad = static_cast<MT>(data_col[thread]);
+    T cur_top_grad = data_col[thread];
     if (data_mask) {
       const T* data_mask_ptr =
           data_mask + (b * deformable_group + deformable_group_index) *
                           kernel_h * kernel_w * height_col * width_col;
-      const MT mask = static_cast<MT>(data_mask_ptr[data_mask_hw_ptr]);
+      const T mask = data_mask_ptr[data_mask_hw_ptr];
       cur_top_grad *= mask;
     }
     const int cur_h = static_cast<int>(cur_inv_h_data);
@@ -90,16 +89,15 @@ inline void ModulatedDeformableCol2imCPUKernel(
             abs(cur_inv_w_data - (cur_w + dx)) < 1) {
           int cur_bottom_grad_pos =
               ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
-          MT weight = DmcnGetGradientWeight(cur_inv_h_data,
-                                            cur_inv_w_data,
-                                            cur_h + dy,
-                                            cur_w + dx,
-                                            height,
-                                            width);
+          T weight = DmcnGetGradientWeight(cur_inv_h_data,
+                                           cur_inv_w_data,
+                                           cur_h + dy,
+                                           cur_w + dx,
+                                           height,
+                                           width);
 
           *(grad_im + cur_bottom_grad_pos) =
-              *(grad_im + cur_bottom_grad_pos) +
-              static_cast<T>(weight * cur_top_grad);
+              *(grad_im + cur_bottom_grad_pos) + (weight * cur_top_grad);
         }
       }
     }
@@ -171,9 +169,8 @@ void ModulatedDeformableCol2imCoordCPUKernel(
     const int width_col,
     T* grad_offset,
     T* grad_mask) {
-  using MT = typename phi::dtype::MPTypeTrait<T>::Type;
   for (int i = 0; i < num_kernels; i++) {
-    MT val = 0, mval = 0;
+    T val = 0, mval = 0;
     const int w = i % width_col;
     const int h = (i / width_col) % height_col;
     const int c = (i / width_col / height_col) % offset_channels;
@@ -218,37 +215,36 @@ void ModulatedDeformableCol2imCoordCPUKernel(
       const int data_offset_w_ptr =
           (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +
            w_out);
-      const MT offset_h = static_cast<MT>(data_offset_ptr[data_offset_h_ptr]);
-      const MT offset_w = static_cast<MT>(data_offset_ptr[data_offset_w_ptr]);
-      MT inv_h = h_in + i * dilation_h + offset_h;
-      MT inv_w = w_in + j * dilation_w + offset_w;
+      const T offset_h = data_offset_ptr[data_offset_h_ptr];
+      const T offset_w = data_offset_ptr[data_offset_w_ptr];
+      T inv_h = h_in + i * dilation_h + offset_h;
+      T inv_w = w_in + j * dilation_w + offset_w;
       if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) {
         inv_h = inv_w = -2;
       } else {
-        mval +=
-            static_cast<MT>(data_col_ptr[col_pos]) *
-            funcs::DmcnIm2colBilinear<T, MT>(data_im_ptr + cnt * height * width,
-                                             width,
-                                             height,
-                                             width,
-                                             inv_h,
-                                             inv_w);
+        mval += data_col_ptr[col_pos] * funcs::DmcnIm2colBilinear<T, T>(
+                                            data_im_ptr + cnt * height * width,
+                                            width,
+                                            height,
+                                            width,
+                                            inv_h,
+                                            inv_w);
       }
-      const MT weight =
-          DmcnGetCoordinateWeight<T, MT>(inv_h,
-                                         inv_w,
-                                         height,
-                                         width,
-                                         data_im_ptr + cnt * height * width,
-                                         width,
-                                         bp_dir);
+      const T weight =
+          DmcnGetCoordinateWeight<T, T>(inv_h,
+                                        inv_w,
+                                        height,
+                                        width,
+                                        data_im_ptr + cnt * height * width,
+                                        width,
+                                        bp_dir);
       if (data_mask_ptr) {
         const int data_mask_hw_ptr =
             (((i * kernel_w + j) * height_col + h_out) * width_col + w_out);
-        const MT mask = static_cast<MT>(data_mask_ptr[data_mask_hw_ptr]);
-        val += weight * static_cast<MT>(data_col_ptr[col_pos]) * mask;
+        const T mask = data_mask_ptr[data_mask_hw_ptr];
+        val += weight * data_col_ptr[col_pos] * mask;
       } else {
-        val += weight * static_cast<MT>(data_col_ptr[col_pos]);
+        val += weight * data_col_ptr[col_pos];
       }
       cnt += 1;
     }
diff --git a/paddle/phi/kernels/funcs/deformable_conv_functor.cc b/paddle/phi/kernels/funcs/deformable_conv_functor.cc
index d5b99760f9a45..226fa51bc335b 100644
--- a/paddle/phi/kernels/funcs/deformable_conv_functor.cc
+++ b/paddle/phi/kernels/funcs/deformable_conv_functor.cc
@@ -13,7 +13,6 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/funcs/deformable_conv_functor.h"
-#include "paddle/fluid/platform/float16.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/common/amp_type_traits.h"
 
@@ -43,7 +42,6 @@ inline void ModulatedDeformableIm2colCPUKernel(
     const int height_col,
     const int width_col,
     T* data_col) {
-  using MT = typename phi::dtype::MPTypeTrait<T>::Type;
   for (int i = 0; i < num_kernels; i++) {
     const int w_col = i % width_col;
     const int h_col = (i / width_col) % height_col;
@@ -78,19 +76,19 @@ inline void ModulatedDeformableIm2colCPUKernel(
             ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +
             w_col;
 
-        const MT offset_h = static_cast<MT>(data_offset_ptr[data_offset_h_ptr]);
-        const MT offset_w = static_cast<MT>(data_offset_ptr[data_offset_w_ptr]);
-        MT val = static_cast<MT>(0);
-        const MT h_im = h_in + i * dilation_h + offset_h;
-        const MT w_im = w_in + j * dilation_w + offset_w;
+        const T offset_h = data_offset_ptr[data_offset_h_ptr];
+        const T offset_w = data_offset_ptr[data_offset_w_ptr];
+        T val = 0;
+        const T h_im = h_in + i * dilation_h + offset_h;
+        const T w_im = w_in + j * dilation_w + offset_w;
         if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) {
-          val = DmcnIm2colBilinear<T, MT>(
+          val = DmcnIm2colBilinear<T, T>(
               data_im_ptr, width, height, width, h_im, w_im);
         }
         if (data_mask_ptr) {
           const int data_mask_hw_ptr =
               ((i * kernel_w + j) * height_col + h_col) * width_col + w_col;
-          const MT mask = static_cast<MT>(data_mask_ptr[data_mask_hw_ptr]);
+          const T mask = data_mask_ptr[data_mask_hw_ptr];
           val *= mask;
         }
         *data_col_ptr = static_cast<T>(val);
diff --git a/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py b/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py
index 54705e108ed41..c800881ee98a9 100644
--- a/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py
+++ b/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py
@@ -17,7 +17,7 @@
 import numpy as np
 import paddle.fluid as fluid
 import paddle.fluid.core as core
-from op_test import OpTest
+from op_test import OpTest, skip_check_grad_ci
 from paddle.fluid.framework import _test_eager_guard
 
 paddle.enable_static()
@@ -288,6 +288,8 @@ def init_type(self):
 
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
+@skip_check_grad_ci(
+    reason="Grad check for deform-fp16 need to be compared with fp32, will be included in another PR")
 class TestWithFloat16(TestModulatedDeformableConvOp):
 
     def init_type(self):
@@ -297,25 +299,15 @@ def test_check_output(self):
         self.check_output(check_eager=True, atol=1e-3)
 
     def test_check_grad(self):
-        self.check_grad(['Input', 'Filter'],
-                        'Output',
-                        numeric_grad_delta=3e-1,
-                        max_relative_error=4e-1,
-                        check_eager=True)
+        pass
 
     def test_check_grad_no_filter(self):
-        self.check_grad(['Input'],
-                        'Output',
-                        numeric_grad_delta=3e-1,
-                        max_relative_error=2e-1,
-                        no_grad_set=set(['Filter']),
-                        check_eager=True)
+        pass
 
 
 class TestModulatedDeformableConvV1InvalidInput(unittest.TestCase):
 
     def test_error(self):
-
         def test_invalid_input():
             input = [1, 3, 32, 32]
             offset = fluid.data(name='offset',

From f1fec53289d0f2640aad79ee5460767ad132758d Mon Sep 17 00:00:00 2001
From: ray96 <lqr5259@outlook.com>
Date: Sun, 9 Oct 2022 13:28:37 +0800
Subject: [PATCH 18/23] code style

---
 .../kernels/funcs/deformable_conv_functor.cu  |  2 +-
 .../impl/deformable_conv_grad_kernel_impl.h   | 56 +++++++++----------
 .../unittests/test_deformable_conv_v1_op.py   |  1 +
 3 files changed, 30 insertions(+), 29 deletions(-)

diff --git a/paddle/phi/kernels/funcs/deformable_conv_functor.cu b/paddle/phi/kernels/funcs/deformable_conv_functor.cu
index 813d549136084..0d5076a4937c3 100644
--- a/paddle/phi/kernels/funcs/deformable_conv_functor.cu
+++ b/paddle/phi/kernels/funcs/deformable_conv_functor.cu
@@ -13,10 +13,10 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/funcs/deformable_conv_functor.h"
-#include "paddle/phi/core/device_context.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/amp_type_traits.h"
 #include "paddle/phi/common/float16.h"
+#include "paddle/phi/core/device_context.h"
 
 namespace phi {
 namespace funcs {
diff --git a/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h b/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h
index ab6d226b03c56..d68619728cd1f 100644
--- a/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h
@@ -17,11 +17,11 @@
 #include "paddle/phi/common/amp_type_traits.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/hostdevice.h"
+#include "paddle/phi/kernels/cast_kernel.h"
 #include "paddle/phi/kernels/empty_kernel.h"
 #include "paddle/phi/kernels/full_kernel.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/deformable_conv_functor.h"
-#include "paddle/phi/kernels/cast_kernel.h"
 
 namespace phi {
 
@@ -302,43 +302,43 @@ void DeformableConvGradKernel(const Context& dev_ctx,
           mask_grad_data_ptr);
     }
     if (dx) {
-      if (x.dtype() == DataType::FLOAT16){
+      if (x.dtype() == DataType::FLOAT16) {
         DenseTensor mt_dx = phi::EmptyLike<MT, Context>(dev_ctx, *dx);
         MT* mt_dx_ptr = dev_ctx.template Alloc<MT>(&mt_dx);
 
-        ModulatedDeformableCol2im(dev_ctx,
-                                  col_buffer_ptr,
-                                  offset_ptr + i * im2col_step * input_offset_dim,
-                                  mask_data_ptr,
-                                  input_shape_vec,
-                                  col_buffer_shape_vec,
-                                  filter_shape_vec,
-                                  paddings,
-                                  strides,
-                                  dilations,
-                                  deformable_groups,
-                                  mt_dx_ptr + i * im2col_step * input_dim);
+        ModulatedDeformableCol2im(
+            dev_ctx,
+            col_buffer_ptr,
+            offset_ptr + i * im2col_step * input_offset_dim,
+            mask_data_ptr,
+            input_shape_vec,
+            col_buffer_shape_vec,
+            filter_shape_vec,
+            paddings,
+            strides,
+            dilations,
+            deformable_groups,
+            mt_dx_ptr + i * im2col_step * input_dim);
 
         DenseTensor t_dx = phi::Cast<MT, Context>(dev_ctx, mt_dx, x.dtype());
         dx->ShareDataWith(t_dx);
       } else {
         MT* mt_dx_ptr = dev_ctx.template Alloc<MT>(dx);
-        ModulatedDeformableCol2im(dev_ctx,
-                                  col_buffer_ptr,
-                                  offset_ptr + i * im2col_step * input_offset_dim,
-                                  mask_data_ptr,
-                                  input_shape_vec,
-                                  col_buffer_shape_vec,
-                                  filter_shape_vec,
-                                  paddings,
-                                  strides,
-                                  dilations,
-                                  deformable_groups,
-                                  mt_dx_ptr + i * im2col_step * input_dim);
-
+        ModulatedDeformableCol2im(
+            dev_ctx,
+            col_buffer_ptr,
+            offset_ptr + i * im2col_step * input_offset_dim,
+            mask_data_ptr,
+            input_shape_vec,
+            col_buffer_shape_vec,
+            filter_shape_vec,
+            paddings,
+            strides,
+            dilations,
+            deformable_groups,
+            mt_dx_ptr + i * im2col_step * input_dim);
       }
       dx->Resize(x.dims());
-
     }
 
     funcs::ModulatedDeformableIm2col(
diff --git a/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py b/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py
index c800881ee98a9..b41ef28c79b14 100644
--- a/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py
+++ b/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py
@@ -308,6 +308,7 @@ def test_check_grad_no_filter(self):
 class TestModulatedDeformableConvV1InvalidInput(unittest.TestCase):
 
     def test_error(self):
+
         def test_invalid_input():
             input = [1, 3, 32, 32]
             offset = fluid.data(name='offset',

From e2c7d3846b4972234e499091a46fe5efd9e079ac Mon Sep 17 00:00:00 2001
From: ray96 <lqr5259@outlook.com>
Date: Sun, 9 Oct 2022 13:43:00 +0800
Subject: [PATCH 19/23] add fp16 grad check

---
 .../unittests/test_deformable_conv_v1_op.py   | 69 +++++++++++++++----
 1 file changed, 54 insertions(+), 15 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py b/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py
index b41ef28c79b14..b60a818c447ad 100644
--- a/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py
+++ b/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py
@@ -285,25 +285,64 @@ class TestWithDouble(TestModulatedDeformableConvOp):
     def init_type(self):
         self.dtype = np.float64
 
+class TestFP16(unittest.TestCase):
+
+    def check_main(self, input_np, offset_np, filter_np, dtype):
+        paddle.disable_static()
+        input_np = input_np.astype(dtype)
+        offset_np = offset_np.astype(dtype)
+        filter_np = filter_np.astype(dtype)
+
+        input = paddle.to_tensor(input_np)
+        offset = paddle.to_tensor(offset_np)
+        filter = paddle.to_tensor(filter_np)
+
+        input.stop_gradient = False
+        offset.stop_gradient = False
+        filter.stop_gradient = False
+
+        y = paddle.vision.ops.deform_conv2d(input, offset, filter)
+        input_grad, offset_grad, filter_grad = paddle.grad(y, [input, offset, filter])
+        y_np = y.numpy().astype('float32')
+        input_grad_np = input_grad.numpy().astype('float32')
+        offset_grad_np = offset_grad.numpy().astype('float32')
+        filter_grad_np = filter_grad.numpy().astype('float32')
+        paddle.enable_static()
+        return y_np, input_grad_np, offset_grad_np, filter_grad_np
+
+    def test_main(self):
+        if not paddle.is_compiled_with_cuda():
+            return
+        self.pad = [1, 1]
+        self.stride = [1, 1]
+        self.dilations = [1, 1]
+        self.groups = 1
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [40, f_c, 1, 1]
+        self.im2col_step = 1
+        self.deformable_groups = 1
+        offset_c = 2 * self.deformable_groups * self.filter_size[
+            2] * self.filter_size[3]
+        self.offset_size = [
+            self.input_size[0], offset_c, self.input_size[2], self.input_size[3]
+        ]
 
-@unittest.skipIf(not core.is_compiled_with_cuda(),
-                 "core is not compiled with CUDA")
-@skip_check_grad_ci(
-    reason="Grad check for deform-fp16 need to be compared with fp32, will be included in another PR")
-class TestWithFloat16(TestModulatedDeformableConvOp):
-
-    def init_type(self):
-        self.dtype = np.float16
-
-    def test_check_output(self):
-        self.check_output(check_eager=True, atol=1e-3)
+        input = np.random.random(self.input_size)
+        offset = 10 * np.random.random(self.offset_size)
+        filter = np.random.random(self.filter_size)
 
-    def test_check_grad(self):
-        pass
+        y_np_1, input_g_np_1, offset_g_np_1, filter_g_np_1 = self.check_main(input,offset,filter, 'float16')
+        y_np_2, input_g_np_2, offset_g_np_2, filter_g_np_2 = self.check_main(input,offset,filter, 'float32')
 
-    def test_check_grad_no_filter(self):
-        pass
+        def assert_equal(x, y):
+            np.testing.assert_allclose(x, y, atol=3e-2)
 
+        assert_equal(y_np_1, y_np_2)
+        assert_equal(input_g_np_1, input_g_np_2)
+        assert_equal(offset_g_np_1, offset_g_np_2)
+        assert_equal(filter_g_np_1, filter_g_np_2)
 
 class TestModulatedDeformableConvV1InvalidInput(unittest.TestCase):
 

From 15183b1496739d73aa90bcaaca3a7d9786b5c931 Mon Sep 17 00:00:00 2001
From: ray96 <lqr5259@outlook.com>
Date: Sun, 9 Oct 2022 13:46:39 +0800
Subject: [PATCH 20/23] code style

---
 .../fluid/tests/unittests/test_deformable_conv_v1_op.py     | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py b/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py
index b60a818c447ad..7e55b9e0673aa 100644
--- a/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py
+++ b/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py
@@ -285,6 +285,7 @@ class TestWithDouble(TestModulatedDeformableConvOp):
     def init_type(self):
         self.dtype = np.float64
 
+
 class TestFP16(unittest.TestCase):
 
     def check_main(self, input_np, offset_np, filter_np, dtype):
@@ -333,8 +334,8 @@ def test_main(self):
         offset = 10 * np.random.random(self.offset_size)
         filter = np.random.random(self.filter_size)
 
-        y_np_1, input_g_np_1, offset_g_np_1, filter_g_np_1 = self.check_main(input,offset,filter, 'float16')
-        y_np_2, input_g_np_2, offset_g_np_2, filter_g_np_2 = self.check_main(input,offset,filter, 'float32')
+        y_np_1, input_g_np_1, offset_g_np_1, filter_g_np_1 = self.check_main(input, offset, filter, 'float16')
+        y_np_2, input_g_np_2, offset_g_np_2, filter_g_np_2 = self.check_main(input, offset, filter, 'float32')
 
         def assert_equal(x, y):
             np.testing.assert_allclose(x, y, atol=3e-2)
@@ -344,6 +345,7 @@ def assert_equal(x, y):
         assert_equal(offset_g_np_1, offset_g_np_2)
         assert_equal(filter_g_np_1, filter_g_np_2)
 
+
 class TestModulatedDeformableConvV1InvalidInput(unittest.TestCase):
 
     def test_error(self):

From 88192ccc2b5dc13ba60cbda156e40df503aad69d Mon Sep 17 00:00:00 2001
From: ray96 <lqr5259@outlook.com>
Date: Sun, 9 Oct 2022 16:28:38 +0800
Subject: [PATCH 21/23] code style

---
 .../fluid/tests/unittests/test_deformable_conv_v1_op.py  | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py b/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py
index 7e55b9e0673aa..982bc5a379981 100644
--- a/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py
+++ b/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py
@@ -303,7 +303,8 @@ def check_main(self, input_np, offset_np, filter_np, dtype):
         filter.stop_gradient = False
 
         y = paddle.vision.ops.deform_conv2d(input, offset, filter)
-        input_grad, offset_grad, filter_grad = paddle.grad(y, [input, offset, filter])
+        input_grad, offset_grad, filter_grad = paddle.grad(
+            y, [input, offset, filter])
         y_np = y.numpy().astype('float32')
         input_grad_np = input_grad.numpy().astype('float32')
         offset_grad_np = offset_grad.numpy().astype('float32')
@@ -334,8 +335,10 @@ def test_main(self):
         offset = 10 * np.random.random(self.offset_size)
         filter = np.random.random(self.filter_size)
 
-        y_np_1, input_g_np_1, offset_g_np_1, filter_g_np_1 = self.check_main(input, offset, filter, 'float16')
-        y_np_2, input_g_np_2, offset_g_np_2, filter_g_np_2 = self.check_main(input, offset, filter, 'float32')
+        y_np_1, input_g_np_1, offset_g_np_1, filter_g_np_1 = self.check_main(
+            input, offset, filter, 'float16')
+        y_np_2, input_g_np_2, offset_g_np_2, filter_g_np_2 = self.check_main(
+            input, offset, filter, 'float32')
 
         def assert_equal(x, y):
             np.testing.assert_allclose(x, y, atol=3e-2)

From c772392ee39bd5344075c3d85bfcb9d98b80f978 Mon Sep 17 00:00:00 2001
From: ray96 <lqr5259@outlook.com>
Date: Sun, 9 Oct 2022 17:12:53 +0800
Subject: [PATCH 22/23] modify

---
 paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc | 2 +-
 paddle/phi/kernels/funcs/deformable_conv_functor.cc   | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc b/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc
index d38310360d0ae..050c61596fee5 100644
--- a/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc
@@ -248,7 +248,7 @@ void ModulatedDeformableCol2imCoordCPUKernel(
       }
       cnt += 1;
     }
-    grad_offset[i] = static_cast<T>(val);
+    grad_offset[i] = val;
     if (grad_mask && offset_c % 2 == 0)
       grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h *
                       kernel_w +
diff --git a/paddle/phi/kernels/funcs/deformable_conv_functor.cc b/paddle/phi/kernels/funcs/deformable_conv_functor.cc
index 226fa51bc335b..253a66adfc6a2 100644
--- a/paddle/phi/kernels/funcs/deformable_conv_functor.cc
+++ b/paddle/phi/kernels/funcs/deformable_conv_functor.cc
@@ -78,20 +78,20 @@ inline void ModulatedDeformableIm2colCPUKernel(
 
         const T offset_h = data_offset_ptr[data_offset_h_ptr];
         const T offset_w = data_offset_ptr[data_offset_w_ptr];
-        T val = 0;
+        T val = static_cast<T>(0);
         const T h_im = h_in + i * dilation_h + offset_h;
         const T w_im = w_in + j * dilation_w + offset_w;
         if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) {
           val = DmcnIm2colBilinear<T, T>(
               data_im_ptr, width, height, width, h_im, w_im);
         }
+        *data_col_ptr = val;
         if (data_mask_ptr) {
           const int data_mask_hw_ptr =
               ((i * kernel_w + j) * height_col + h_col) * width_col + w_col;
           const T mask = data_mask_ptr[data_mask_hw_ptr];
-          val *= mask;
+          *data_col_ptr *= mask;
         }
-        *data_col_ptr = static_cast<T>(val);
         data_col_ptr += batch_size * height_col * width_col;
       }
     }

From 82ce081a7f9077d8c6ca340a92bcedc5a2a76388 Mon Sep 17 00:00:00 2001
From: ray96 <lqr5259@outlook.com>
Date: Sun, 9 Oct 2022 22:32:14 +0800
Subject: [PATCH 23/23] modify

---
 .../impl/deformable_conv_grad_kernel_impl.h   | 50 ++++++-------------
 1 file changed, 14 insertions(+), 36 deletions(-)

diff --git a/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h b/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h
index d68619728cd1f..7402a2273365b 100644
--- a/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h
@@ -302,42 +302,20 @@ void DeformableConvGradKernel(const Context& dev_ctx,
           mask_grad_data_ptr);
     }
     if (dx) {
-      if (x.dtype() == DataType::FLOAT16) {
-        DenseTensor mt_dx = phi::EmptyLike<MT, Context>(dev_ctx, *dx);
-        MT* mt_dx_ptr = dev_ctx.template Alloc<MT>(&mt_dx);
-
-        ModulatedDeformableCol2im(
-            dev_ctx,
-            col_buffer_ptr,
-            offset_ptr + i * im2col_step * input_offset_dim,
-            mask_data_ptr,
-            input_shape_vec,
-            col_buffer_shape_vec,
-            filter_shape_vec,
-            paddings,
-            strides,
-            dilations,
-            deformable_groups,
-            mt_dx_ptr + i * im2col_step * input_dim);
-
-        DenseTensor t_dx = phi::Cast<MT, Context>(dev_ctx, mt_dx, x.dtype());
-        dx->ShareDataWith(t_dx);
-      } else {
-        MT* mt_dx_ptr = dev_ctx.template Alloc<MT>(dx);
-        ModulatedDeformableCol2im(
-            dev_ctx,
-            col_buffer_ptr,
-            offset_ptr + i * im2col_step * input_offset_dim,
-            mask_data_ptr,
-            input_shape_vec,
-            col_buffer_shape_vec,
-            filter_shape_vec,
-            paddings,
-            strides,
-            dilations,
-            deformable_groups,
-            mt_dx_ptr + i * im2col_step * input_dim);
-      }
+      MT* mt_dx_ptr = dev_ctx.template Alloc<MT>(dx);
+
+      ModulatedDeformableCol2im(dev_ctx,
+                                col_buffer_ptr,
+                                offset_ptr + i * im2col_step * input_offset_dim,
+                                mask_data_ptr,
+                                input_shape_vec,
+                                col_buffer_shape_vec,
+                                filter_shape_vec,
+                                paddings,
+                                strides,
+                                dilations,
+                                deformable_groups,
+                                mt_dx_ptr + i * im2col_step * input_dim);
       dx->Resize(x.dims());
     }