From 6b510559a1ea0034965eb16c87f263eeb64e66b4 Mon Sep 17 00:00:00 2001 From: ray96 Date: Fri, 16 Sep 2022 11:32:24 +0800 Subject: [PATCH 01/23] add support fp16 for deformable --- .../cpu/deformable_conv_grad_kernel.cc | 231 ++++++++++-------- .../kernels/funcs/deformable_conv_functor.cc | 25 +- .../kernels/funcs/deformable_conv_functor.cu | 43 +++- .../kernels/funcs/deformable_conv_functor.h | 55 +++-- .../gpu/deformable_conv_grad_kernel.cu | 84 ++++--- .../phi/kernels/gpu/deformable_conv_kernel.cu | 3 +- .../impl/deformable_conv_grad_kernel_impl.h | 51 ++-- .../impl/deformable_conv_kernel_impl.h | 2 + .../unittests/test_deformable_conv_v1_op.py | 27 +- 9 files changed, 309 insertions(+), 212 deletions(-) diff --git a/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc b/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc index a4d43ef8fbe89..093da5fe026b2 100644 --- a/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc @@ -14,15 +14,24 @@ #include "paddle/phi/kernels/deformable_conv_grad_kernel.h" -#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h" namespace phi { +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaximumNumBlocks = 4096; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaximumNumBlocks); +} + template -inline void ModulatedDeformableCol2imCPUKernel( - const int num_kernels, +__global__ void ModulatedDeformableCol2imGpuKernel( + const int nthreads, const T* data_col, const T* data_offset, const T* data_mask, @@ -43,7 +52,10 @@ inline void ModulatedDeformableCol2imCPUKernel( const int height_col, const int width_col, T* grad_im) { - for (int thread = 0; thread < num_kernels; thread++) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + using MT = typename phi::dtype::MPTypeTrait::Type; + for (size_t thread = index; thread < nthreads; thread += offset) { const int j = (thread / width_col / height_col / batch_size) % kernel_w; const int i = (thread / width_col / height_col / batch_size / kernel_w) % kernel_h; @@ -67,17 +79,17 @@ inline void ModulatedDeformableCol2imCPUKernel( ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_out) * width_col + w_out; - const T offset_h = data_offset_ptr[data_offset_h_ptr]; - const T offset_w = data_offset_ptr[data_offset_w_ptr]; - const T cur_inv_h_data = h_in + i * dilation_h + offset_h; - const T cur_inv_w_data = w_in + j * dilation_w + offset_w; + const MT offset_h = static_cast(data_offset_ptr[data_offset_h_ptr]); + const MT offset_w = static_cast(data_offset_ptr[data_offset_w_ptr]); + const MT cur_inv_h_data = h_in + i * dilation_h + offset_h; + const MT cur_inv_w_data = w_in + j * dilation_w + offset_w; - T cur_top_grad = data_col[thread]; + MT cur_top_grad = static_cast(data_col[thread]); if (data_mask) { const T* data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; - const T mask = data_mask_ptr[data_mask_hw_ptr]; + const MT mask = static_cast(data_mask_ptr[data_mask_hw_ptr]); cur_top_grad *= mask; } const int cur_h = static_cast(cur_inv_h_data); @@ -89,15 +101,16 @@ inline void ModulatedDeformableCol2imCPUKernel( abs(cur_inv_w_data - (cur_w + dx)) < 1) { int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; - T weight = DmcnGetGradientWeight(cur_inv_h_data, - cur_inv_w_data, - cur_h + dy, - cur_w + dx, - height, - width); + MT weight = DmcnGetGradientWeight(cur_inv_h_data, + cur_inv_w_data, + cur_h + dy, + cur_w + dx, + height, + width); - *(grad_im + cur_bottom_grad_pos) = - *(grad_im + cur_bottom_grad_pos) + weight * cur_top_grad; + paddle::platform::CudaAtomicAdd( + grad_im + cur_bottom_grad_pos, + static_cast(weight * cur_top_grad)); } } } @@ -119,33 +132,36 @@ void ModulatedDeformableCol2im(const Context& dev_ctx, T* grad_im) { int channel_per_deformable_group = im_shape[0] / deformable_group; int num_kernels = col_shape[0] * col_shape[1] * col_shape[2] * col_shape[3]; + int blocks = NumBlocks(num_kernels); + int threads = kNumCUDAThreads; - ModulatedDeformableCol2imCPUKernel(num_kernels, - data_col, - data_offset, - data_mask, - im_shape[0], - im_shape[1], - im_shape[2], - kernel_shape[2], - kernel_shape[3], - pad[0], - pad[1], - stride[0], - stride[1], - dilation[0], - dilation[1], - channel_per_deformable_group, - col_shape[1], - deformable_group, - col_shape[2], - col_shape[3], - grad_im); + ModulatedDeformableCol2imGpuKernel + <<>>(num_kernels, + data_col, + data_offset, + data_mask, + im_shape[0], + im_shape[1], + im_shape[2], + kernel_shape[2], + kernel_shape[3], + pad[0], + pad[1], + stride[0], + stride[1], + dilation[0], + dilation[1], + channel_per_deformable_group, + col_shape[1], + deformable_group, + col_shape[2], + col_shape[3], + grad_im); } template -void ModulatedDeformableCol2imCoordCPUKernel( - const int num_kernels, +__global__ void ModulatedDeformableCol2imCoordGpuKernel( + const int nthreads, const T* data_col, const T* data_im, const T* data_offset, @@ -169,8 +185,11 @@ void ModulatedDeformableCol2imCoordCPUKernel( const int width_col, T* grad_offset, T* grad_mask) { - for (int i = 0; i < num_kernels; i++) { - T val = 0, mval = 0; + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + using MT = typename phi::dtype::MPTypeTrait::Type; + for (size_t i = index; i < nthreads; i += offset) { + MT val = 0, mval = 0; const int w = i % width_col; const int h = (i / width_col) % height_col; const int c = (i / width_col / height_col) % offset_channels; @@ -215,40 +234,42 @@ void ModulatedDeformableCol2imCoordCPUKernel( const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out); - const T offset_h = data_offset_ptr[data_offset_h_ptr]; - const T offset_w = data_offset_ptr[data_offset_w_ptr]; - T inv_h = h_in + i * dilation_h + offset_h; - T inv_w = w_in + j * dilation_w + offset_w; + + const MT offset_h = static_cast(data_offset_ptr[data_offset_h_ptr]); + const MT offset_w = static_cast(data_offset_ptr[data_offset_w_ptr]); + MT inv_h = h_in + i * dilation_h + offset_h; + MT inv_w = w_in + j * dilation_w + offset_w; if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) { inv_h = inv_w = -2; } else { - mval += data_col_ptr[col_pos] * - funcs::DmcnIm2colBilinear(data_im_ptr + cnt * height * width, - width, - height, - width, - inv_h, - inv_w); + mval += + static_cast(data_col_ptr[col_pos]) * + funcs::DmcnIm2colBilinear(data_im_ptr + cnt * height * width, + width, + height, + width, + inv_h, + inv_w); } - const T weight = - DmcnGetCoordinateWeight(inv_h, - inv_w, - height, - width, - data_im_ptr + cnt * height * width, - width, - bp_dir); + const MT weight = + DmcnGetCoordinateWeight(inv_h, + inv_w, + height, + width, + data_im_ptr + cnt * height * width, + width, + bp_dir); if (data_mask_ptr) { const int data_mask_hw_ptr = (((i * kernel_w + j) * height_col + h_out) * width_col + w_out); - const T mask = data_mask_ptr[data_mask_hw_ptr]; - val += weight * data_col_ptr[col_pos] * mask; + const MT mask = static_cast(data_mask_ptr[data_mask_hw_ptr]); + val += weight * static_cast(data_col_ptr[col_pos]) * mask; } else { - val += weight * data_col_ptr[col_pos]; + val += weight * static_cast(data_col_ptr[col_pos]); } cnt += 1; } - grad_offset[i] = val; + grad_offset[i] = static_cast(val); if (grad_mask && offset_c % 2 == 0) grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + @@ -278,32 +299,49 @@ void ModulatedDeformableCol2imCoord(const Context& dev_ctx, int num_kernels = 2 * kernel_shape[2] * kernel_shape[3] * col_shape[1] * col_shape[2] * col_shape[3] * deformable_groups; int channel_per_deformable_group = col_shape[0] / deformable_groups; + int blocks = NumBlocks(num_kernels); + int threads = kNumCUDAThreads; - ModulatedDeformableCol2imCoordCPUKernel( - num_kernels, - data_col, - data_im, - data_offset, - data_mask, - im_shape[0], - im_shape[1], - im_shape[2], - kernel_shape[2], - kernel_shape[3], - paddings[0], - paddings[1], - strides[0], - strides[1], - dilations[0], - dilations[1], - channel_per_deformable_group, - col_shape[1], - 2 * kernel_shape[2] * kernel_shape[3] * deformable_groups, - deformable_groups, - col_shape[2], - col_shape[3], - grad_offset, - grad_mask); + ModulatedDeformableCol2imCoordGpuKernel + <<>>( + num_kernels, + data_col, + data_im, + data_offset, + data_mask, + im_shape[0], + im_shape[1], + im_shape[2], + kernel_shape[2], + kernel_shape[3], + paddings[0], + paddings[1], + strides[0], + strides[1], + dilations[0], + dilations[1], + channel_per_deformable_group, + col_shape[1], + 2 * kernel_shape[2] * kernel_shape[3] * deformable_groups, + deformable_groups, + col_shape[2], + col_shape[3], + grad_offset, + grad_mask); +} + +template +__global__ void FilterGradAddupGpuKernel(const int nthreads, + const int n, + const int height, + const int width, + const T* dweight_3d, + T* filter_grad) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (size_t i = index; i < nthreads; i += offset) { + filter_grad[i] = filter_grad[i] + dweight_3d[i]; + } } template @@ -314,16 +352,17 @@ void FilterGradAddup(const Context& dev_ctx, const int width, const T* dweight_3d, T* filter_grad) { - for (int i = 0; i < nthreads; i++) { - filter_grad[i] = filter_grad[i] + dweight_3d[i]; - } + FilterGradAddupGpuKernel + <<>>( + nthreads, n, height, width, dweight_3d, filter_grad); } } // namespace phi PD_REGISTER_KERNEL(deformable_conv_grad, - CPU, + GPU, ALL_LAYOUT, phi::DeformableConvGradKernel, float, - double) {} + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/funcs/deformable_conv_functor.cc b/paddle/phi/kernels/funcs/deformable_conv_functor.cc index 48858fa59390e..f6483b9cdd2e8 100644 --- a/paddle/phi/kernels/funcs/deformable_conv_functor.cc +++ b/paddle/phi/kernels/funcs/deformable_conv_functor.cc @@ -13,8 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/funcs/deformable_conv_functor.h" - +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/float16.h" #include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/amp_type_traits.h" namespace phi { namespace funcs { @@ -42,6 +44,7 @@ inline void ModulatedDeformableIm2colCPUKernel( const int height_col, const int width_col, T* data_col) { + using MT = typename phi::dtype::MPTypeTrait::Type; for (int i = 0; i < num_kernels; i++) { const int w_col = i % width_col; const int h_col = (i / width_col) % height_col; @@ -76,22 +79,22 @@ inline void ModulatedDeformableIm2colCPUKernel( ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col; - const T offset_h = data_offset_ptr[data_offset_h_ptr]; - const T offset_w = data_offset_ptr[data_offset_w_ptr]; - T val = static_cast(0); - const T h_im = h_in + i * dilation_h + offset_h; - const T w_im = w_in + j * dilation_w + offset_w; + const MT offset_h = static_cast(data_offset_ptr[data_offset_h_ptr]); + const MT offset_w = static_cast(data_offset_ptr[data_offset_w_ptr]); + MT val = static_cast(0); + const MT h_im = h_in + i * dilation_h + offset_h; + const MT w_im = w_in + j * dilation_w + offset_w; if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) { - val = - DmcnIm2colBilinear(data_im_ptr, width, height, width, h_im, w_im); + val = DmcnIm2colBilinear( + data_im_ptr, width, height, width, h_im, w_im); } - *data_col_ptr = val; if (data_mask_ptr) { const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_col) * width_col + w_col; - const T mask = data_mask_ptr[data_mask_hw_ptr]; - *data_col_ptr *= mask; + const MT mask = static_cast(data_mask_ptr[data_mask_hw_ptr]); + val *= mask; } + *data_col_ptr = static_cast(val); data_col_ptr += batch_size * height_col * width_col; } } diff --git a/paddle/phi/kernels/funcs/deformable_conv_functor.cu b/paddle/phi/kernels/funcs/deformable_conv_functor.cu index bebea5dcb74ca..7ae2526406f2b 100644 --- a/paddle/phi/kernels/funcs/deformable_conv_functor.cu +++ b/paddle/phi/kernels/funcs/deformable_conv_functor.cu @@ -12,8 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/kernels/funcs/deformable_conv_functor.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/common/float16.h" namespace phi { namespace funcs { @@ -51,6 +54,8 @@ __global__ void ModulatedDeformableIm2colGpuKernel( T* data_col) { int index = blockIdx.x * blockDim.x + threadIdx.x; int offset = blockDim.x * gridDim.x; + + using MT = typename phi::dtype::MPTypeTrait::Type; for (size_t i = index; i < nthreads; i += offset) { const int w_col = i % width_col; const int h_col = (i / width_col) % height_col; @@ -77,6 +82,8 @@ __global__ void ModulatedDeformableIm2colGpuKernel( kernel_h * kernel_w * height_col * width_col : nullptr; + // 上边都动不了,指针移动也不会引起误差和速度降低 + for (int i = 0; i < kernel_h; ++i) { for (int j = 0; j < kernel_w; ++j) { const int data_offset_h_ptr = @@ -85,22 +92,22 @@ __global__ void ModulatedDeformableIm2colGpuKernel( ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col; - const T offset_h = data_offset_ptr[data_offset_h_ptr]; - const T offset_w = data_offset_ptr[data_offset_w_ptr]; - T val = static_cast(0); - const T h_im = h_in + i * dilation_h + offset_h; - const T w_im = w_in + j * dilation_w + offset_w; + const MT offset_h = static_cast(data_offset_ptr[data_offset_h_ptr]); + const MT offset_w = static_cast(data_offset_ptr[data_offset_w_ptr]); + MT val = static_cast(0); + const MT h_im = h_in + i * dilation_h + offset_h; + const MT w_im = w_in + j * dilation_w + offset_w; if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) { - val = - DmcnIm2colBilinear(data_im_ptr, width, height, width, h_im, w_im); + val = DmcnIm2colBilinear( + data_im_ptr, width, height, width, h_im, w_im); } - *data_col_ptr = val; if (data_mask_ptr) { const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_col) * width_col + w_col; - const T mask = data_mask_ptr[data_mask_hw_ptr]; - *data_col_ptr *= mask; + const MT mask = static_cast(data_mask_ptr[data_mask_hw_ptr]); + val *= mask; } + *data_col_ptr = static_cast(val); data_col_ptr += batch_size * height_col * width_col; } } @@ -164,6 +171,20 @@ template void ModulatedDeformableIm2col( const int deformable_groups, float* data_col); +template void ModulatedDeformableIm2col( + const phi::GPUContext& dev_ctx, + const phi::dtype::float16* data_im, + const phi::dtype::float16* data_offset, + const phi::dtype::float16* data_mask, + const std::vector& im_shape, + const std::vector& col_shape, + const std::vector& filter_shape, + const std::vector& paddings, + const std::vector& strides, + const std::vector& dilations, + const int deformable_groups, + phi::dtype::float16* data_col); + template void ModulatedDeformableIm2col( const phi::GPUContext& dev_ctx, const double* data_im, diff --git a/paddle/phi/kernels/funcs/deformable_conv_functor.h b/paddle/phi/kernels/funcs/deformable_conv_functor.h index eecda72927510..62e42cd58334f 100644 --- a/paddle/phi/kernels/funcs/deformable_conv_functor.h +++ b/paddle/phi/kernels/funcs/deformable_conv_functor.h @@ -14,44 +14,47 @@ #pragma once +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/common/float16.h" #include "paddle/phi/core/dense_tensor.h" namespace phi { namespace funcs { -template -HOSTDEVICE T DmcnIm2colBilinear(const T* bottom_data, - const int data_width, - const int height, - const int width, - T h, - T w) { +template +HOSTDEVICE MT DmcnIm2colBilinear(const T* bottom_data, + const int data_width, + const int height, + const int width, + MT h, + MT w) { int h_low = floor(h); int w_low = floor(w); int h_high = h_low + 1; int w_high = w_low + 1; - T lh = h - h_low; - T lw = w - w_low; - T hh = 1 - lh; - T hw = 1 - lw; + MT lh = h - h_low; + MT lw = w - w_low; + MT hh = 1 - lh; + MT hw = 1 - lw; - T v1 = - (h_low >= 0 && w_low >= 0) ? bottom_data[h_low * data_width + w_low] : 0; - T v2 = (h_low >= 0 && w_high <= width - 1) - ? bottom_data[h_low * data_width + w_high] - : 0; - T v3 = (h_high <= height - 1 && w_low >= 0) - ? bottom_data[h_high * data_width + w_low] - : 0; - T v4 = (h_high <= height - 1 && w_high <= width - 1) - ? bottom_data[h_high * data_width + w_high] - : 0; + MT v1 = (h_low >= 0 && w_low >= 0) + ? static_cast(bottom_data[h_low * data_width + w_low]) + : 0; + MT v2 = (h_low >= 0 && w_high <= width - 1) + ? static_cast(bottom_data[h_low * data_width + w_high]) + : 0; + MT v3 = (h_high <= height - 1 && w_low >= 0) + ? static_cast(bottom_data[h_high * data_width + w_low]) + : 0; + MT v4 = (h_high <= height - 1 && w_high <= width - 1) + ? static_cast(bottom_data[h_high * data_width + w_high]) + : 0; - T w1 = hh * hw; - T w2 = hh * lw; - T w3 = lh * hw; - T w4 = lh * lw; + MT w1 = hh * hw; + MT w2 = hh * lw; + MT w3 = lh * hw; + MT w4 = lh * lw; return w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4; } diff --git a/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu b/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu index b46f1f4a3314d..093da5fe026b2 100644 --- a/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu @@ -54,6 +54,7 @@ __global__ void ModulatedDeformableCol2imGpuKernel( T* grad_im) { int index = blockIdx.x * blockDim.x + threadIdx.x; int offset = blockDim.x * gridDim.x; + using MT = typename phi::dtype::MPTypeTrait::Type; for (size_t thread = index; thread < nthreads; thread += offset) { const int j = (thread / width_col / height_col / batch_size) % kernel_w; const int i = @@ -78,17 +79,17 @@ __global__ void ModulatedDeformableCol2imGpuKernel( ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_out) * width_col + w_out; - const T offset_h = data_offset_ptr[data_offset_h_ptr]; - const T offset_w = data_offset_ptr[data_offset_w_ptr]; - const T cur_inv_h_data = h_in + i * dilation_h + offset_h; - const T cur_inv_w_data = w_in + j * dilation_w + offset_w; + const MT offset_h = static_cast(data_offset_ptr[data_offset_h_ptr]); + const MT offset_w = static_cast(data_offset_ptr[data_offset_w_ptr]); + const MT cur_inv_h_data = h_in + i * dilation_h + offset_h; + const MT cur_inv_w_data = w_in + j * dilation_w + offset_w; - T cur_top_grad = data_col[thread]; + MT cur_top_grad = static_cast(data_col[thread]); if (data_mask) { const T* data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; - const T mask = data_mask_ptr[data_mask_hw_ptr]; + const MT mask = static_cast(data_mask_ptr[data_mask_hw_ptr]); cur_top_grad *= mask; } const int cur_h = static_cast(cur_inv_h_data); @@ -100,15 +101,16 @@ __global__ void ModulatedDeformableCol2imGpuKernel( abs(cur_inv_w_data - (cur_w + dx)) < 1) { int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; - T weight = DmcnGetGradientWeight(cur_inv_h_data, - cur_inv_w_data, - cur_h + dy, - cur_w + dx, - height, - width); + MT weight = DmcnGetGradientWeight(cur_inv_h_data, + cur_inv_w_data, + cur_h + dy, + cur_w + dx, + height, + width); - paddle::platform::CudaAtomicAdd(grad_im + cur_bottom_grad_pos, - weight * cur_top_grad); + paddle::platform::CudaAtomicAdd( + grad_im + cur_bottom_grad_pos, + static_cast(weight * cur_top_grad)); } } } @@ -185,8 +187,9 @@ __global__ void ModulatedDeformableCol2imCoordGpuKernel( T* grad_mask) { int index = blockIdx.x * blockDim.x + threadIdx.x; int offset = blockDim.x * gridDim.x; + using MT = typename phi::dtype::MPTypeTrait::Type; for (size_t i = index; i < nthreads; i += offset) { - T val = 0, mval = 0; + MT val = 0, mval = 0; const int w = i % width_col; const int h = (i / width_col) % height_col; const int c = (i / width_col / height_col) % offset_channels; @@ -231,40 +234,42 @@ __global__ void ModulatedDeformableCol2imCoordGpuKernel( const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out); - const T offset_h = data_offset_ptr[data_offset_h_ptr]; - const T offset_w = data_offset_ptr[data_offset_w_ptr]; - T inv_h = h_in + i * dilation_h + offset_h; - T inv_w = w_in + j * dilation_w + offset_w; + + const MT offset_h = static_cast(data_offset_ptr[data_offset_h_ptr]); + const MT offset_w = static_cast(data_offset_ptr[data_offset_w_ptr]); + MT inv_h = h_in + i * dilation_h + offset_h; + MT inv_w = w_in + j * dilation_w + offset_w; if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) { inv_h = inv_w = -2; } else { - mval += data_col_ptr[col_pos] * - funcs::DmcnIm2colBilinear(data_im_ptr + cnt * height * width, - width, - height, - width, - inv_h, - inv_w); + mval += + static_cast(data_col_ptr[col_pos]) * + funcs::DmcnIm2colBilinear(data_im_ptr + cnt * height * width, + width, + height, + width, + inv_h, + inv_w); } - const T weight = - DmcnGetCoordinateWeight(inv_h, - inv_w, - height, - width, - data_im_ptr + cnt * height * width, - width, - bp_dir); + const MT weight = + DmcnGetCoordinateWeight(inv_h, + inv_w, + height, + width, + data_im_ptr + cnt * height * width, + width, + bp_dir); if (data_mask_ptr) { const int data_mask_hw_ptr = (((i * kernel_w + j) * height_col + h_out) * width_col + w_out); - const T mask = data_mask_ptr[data_mask_hw_ptr]; - val += weight * data_col_ptr[col_pos] * mask; + const MT mask = static_cast(data_mask_ptr[data_mask_hw_ptr]); + val += weight * static_cast(data_col_ptr[col_pos]) * mask; } else { - val += weight * data_col_ptr[col_pos]; + val += weight * static_cast(data_col_ptr[col_pos]); } cnt += 1; } - grad_offset[i] = val; + grad_offset[i] = static_cast(val); if (grad_mask && offset_c % 2 == 0) grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + @@ -359,4 +364,5 @@ PD_REGISTER_KERNEL(deformable_conv_grad, ALL_LAYOUT, phi::DeformableConvGradKernel, float, - double) {} + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/deformable_conv_kernel.cu b/paddle/phi/kernels/gpu/deformable_conv_kernel.cu index 2476dcbafb984..021791ca93061 100644 --- a/paddle/phi/kernels/gpu/deformable_conv_kernel.cu +++ b/paddle/phi/kernels/gpu/deformable_conv_kernel.cu @@ -23,4 +23,5 @@ PD_REGISTER_KERNEL(deformable_conv, ALL_LAYOUT, phi::DeformableConvKernel, float, - double) {} + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h b/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h index 744c48b2bfbd6..90a5243fc72d4 100644 --- a/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h @@ -14,6 +14,7 @@ #pragma once +#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/hostdevice.h" #include "paddle/phi/kernels/empty_kernel.h" @@ -58,14 +59,14 @@ HOSTDEVICE T DmcnGetGradientWeight(T argmax_h, return weight; } -template -HOSTDEVICE T DmcnGetCoordinateWeight(T argmax_h, - T argmax_w, - const int height, - const int width, - const T* im_data, - const int data_width, - const int bp_dir) { +template +HOSTDEVICE MT DmcnGetCoordinateWeight(MT argmax_h, + MT argmax_w, + const int height, + const int width, + const T* im_data, + const int data_width, + const int bp_dir) { if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) { return 0; @@ -76,43 +77,51 @@ HOSTDEVICE T DmcnGetCoordinateWeight(T argmax_h, int argmax_h_high = argmax_h_low + 1; int argmax_w_high = argmax_w_low + 1; - T weight = 0; + MT weight = 0; if (bp_dir == 0) { weight += (argmax_h_low >= 0 && argmax_w_low >= 0) ? -1 * (argmax_w_low + 1 - argmax_w) * - im_data[argmax_h_low * data_width + argmax_w_low] + static_cast( + im_data[argmax_h_low * data_width + argmax_w_low]) : 0; weight += (argmax_h_low >= 0 && argmax_w_high <= width - 1) ? -1 * (argmax_w - argmax_w_low) * - im_data[argmax_h_low * data_width + argmax_w_high] + static_cast( + im_data[argmax_h_low * data_width + argmax_w_high]) : 0; weight += (argmax_h_high <= height - 1 && argmax_w_low >= 0) ? (argmax_w_low + 1 - argmax_w) * - im_data[argmax_h_high * data_width + argmax_w_low] + static_cast( + im_data[argmax_h_high * data_width + argmax_w_low]) : 0; weight += (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) ? (argmax_w - argmax_w_low) * - im_data[argmax_h_high * data_width + argmax_w_high] + static_cast( + im_data[argmax_h_high * data_width + argmax_w_high]) : 0; } else if (bp_dir == 1) { weight += (argmax_h_low >= 0 && argmax_w_low >= 0) ? -1 * (argmax_h_low + 1 - argmax_h) * - im_data[argmax_h_low * data_width + argmax_w_low] + static_cast( + im_data[argmax_h_low * data_width + argmax_w_low]) : 0; weight += (argmax_h_low >= 0 && argmax_w_high <= width - 1) ? (argmax_h_low + 1 - argmax_h) * - im_data[argmax_h_low * data_width + argmax_w_high] + static_cast( + im_data[argmax_h_low * data_width + argmax_w_high]) : 0; weight += (argmax_h_high <= height - 1 && argmax_w_low >= 0) ? -1 * (argmax_h - argmax_h_low) * - im_data[argmax_h_high * data_width + argmax_w_low] + static_cast( + im_data[argmax_h_high * data_width + argmax_w_low]) : 0; weight += (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) ? (argmax_h - argmax_h_low) * - im_data[argmax_h_high * data_width + argmax_w_high] + static_cast( + im_data[argmax_h_high * data_width + argmax_w_high]) : 0; } @@ -290,7 +299,7 @@ void DeformableConvGradKernel(const Context& dev_ctx, deformable_groups, offset_grad_ptr + i * im2col_step * input_offset_dim, mask_grad_data_ptr); - } + } // check if (dx) { T* dx_ptr = dx->data(); // get grad of input @@ -305,7 +314,7 @@ void DeformableConvGradKernel(const Context& dev_ctx, strides, dilations, deformable_groups, - dx_ptr + i * im2col_step * input_dim); + dx_ptr + i * im2col_step * input_dim); //待改 dx->Resize(x.dims()); } @@ -321,7 +330,7 @@ void DeformableConvGradKernel(const Context& dev_ctx, strides, dilations, deformable_groups, - col_buffer_ptr); + col_buffer_ptr); // check col_buffer_3d.Resize(col_buffer_3d_shape); @@ -353,7 +362,7 @@ void DeformableConvGradKernel(const Context& dev_ctx, K, M, dweight_3d.data(), - filter_grad->data()); + filter_grad->data()); // 待改 } } if (filter_grad) { diff --git a/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h b/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h index f864c2e5f0ed0..6bda245f3bda9 100644 --- a/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h +++ b/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h @@ -14,6 +14,7 @@ #pragma once +#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/hostdevice.h" #include "paddle/phi/kernels/empty_kernel.h" @@ -38,6 +39,7 @@ void DeformableConvKernel(const Context& dev_ctx, DenseTensor* out) { const int batch_size = static_cast(x.dims()[0]); + using MT = typename phi::dtype::MPTypeTrait::Type; std::vector filter_shape_vec(phi::vectorize(filter.dims())); std::vector output_shape_vec(phi::vectorize(out->dims())); diff --git a/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py b/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py index a60881e8dded2..57afd9f45aa27 100644 --- a/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py +++ b/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py @@ -23,6 +23,9 @@ from paddle.fluid.framework import _test_eager_guard +paddle.enable_static() + + def dmc_bilinear(data_im, height, width, h, w): h_low = int(np.floor(h)) w_low = int(np.floor(w)) @@ -61,8 +64,8 @@ def dconv_im2col_gemm(input, offset, filter, group, conv_param): assert f_c * group == in_c assert np.mod(out_c, group) == 0 - stride, pad, dilation = conv_param['stride'], conv_param['pad'],\ - conv_param['dilation'] + stride, pad, dilation = conv_param['stride'], conv_param['pad'], \ + conv_param['dilation'] out_h = 1 + (in_h + 2 * pad[0] - (dilation[0] * (f_h - 1) + 1)) // stride[0] out_w = 1 + (in_w + 2 * pad[1] - (dilation[1] * (f_w - 1) + 1)) // stride[1] assert out_h == in_h @@ -76,18 +79,18 @@ def dconv_im2col_gemm(input, offset, filter, group, conv_param): for kh in range(f_h): for kw in range(f_w): offset_h_table = \ - offset[n, ::2, h, w].reshape(f_h, f_w) + offset[n, ::2, h, w].reshape(f_h, f_w) offset_w_table = \ - offset[n, 1::2, h, w].reshape(f_h, f_w) + offset[n, 1::2, h, w].reshape(f_h, f_w) offset_h = offset_h_table[kh, kw] offset_w = offset_w_table[kh, kw] val = 0 im_h = h * stride[0] + kh * dilation[0] \ - + offset_h - pad[0] + + offset_h - pad[0] im_w = w * stride[0] + kw * dilation[0] \ - + offset_w - pad[1] + + offset_w - pad[1] if im_h > -1 and im_w > -1 and \ - im_h < in_h and im_w < in_h: + im_h < in_h and im_w < in_h: val = dmc_bilinear(input[n, c], in_h, in_w, im_h, im_w) val_out = val @@ -201,6 +204,7 @@ def init_type(self): self.dtype = np.float32 + class TestWithStride(TestModulatedDeformableConvOp): def init_test_case(self): @@ -286,6 +290,15 @@ def init_type(self): self.dtype = np.float64 +class TestWithFloat16(TestModulatedDeformableConvOp): + + def init_type(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output(check_eager=True, atol=1e-3) + + class TestModulatedDeformableConvV1InvalidInput(unittest.TestCase): def test_error(self): From 3fa8f48afc683798debbb7e70c0d9e73a75cc6f5 Mon Sep 17 00:00:00 2001 From: ray96 Date: Fri, 16 Sep 2022 11:35:04 +0800 Subject: [PATCH 02/23] fix error push --- .../cpu/deformable_conv_grad_kernel.cc | 172 +++++++----------- 1 file changed, 68 insertions(+), 104 deletions(-) diff --git a/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc b/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc index 093da5fe026b2..f6ff5d63372a0 100644 --- a/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc @@ -14,24 +14,15 @@ #include "paddle/phi/kernels/deformable_conv_grad_kernel.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h" namespace phi { -static constexpr int kNumCUDAThreads = 512; -static constexpr int kNumMaximumNumBlocks = 4096; - -static inline int NumBlocks(const int N) { - return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, - kNumMaximumNumBlocks); -} - template -__global__ void ModulatedDeformableCol2imGpuKernel( - const int nthreads, +inline void ModulatedDeformableCol2imCPUKernel( + const int num_kernels, const T* data_col, const T* data_offset, const T* data_mask, @@ -52,10 +43,9 @@ __global__ void ModulatedDeformableCol2imGpuKernel( const int height_col, const int width_col, T* grad_im) { - int index = blockIdx.x * blockDim.x + threadIdx.x; - int offset = blockDim.x * gridDim.x; + using MT = typename phi::dtype::MPTypeTrait::Type; - for (size_t thread = index; thread < nthreads; thread += offset) { + for (int thread = 0; thread < num_kernels; thread++) { const int j = (thread / width_col / height_col / batch_size) % kernel_w; const int i = (thread / width_col / height_col / batch_size / kernel_w) % kernel_h; @@ -108,9 +98,8 @@ __global__ void ModulatedDeformableCol2imGpuKernel( height, width); - paddle::platform::CudaAtomicAdd( - grad_im + cur_bottom_grad_pos, - static_cast(weight * cur_top_grad)); + *(grad_im + cur_bottom_grad_pos) = + *(grad_im + cur_bottom_grad_pos) + static_cast(weight * cur_top_grad); } } } @@ -132,36 +121,33 @@ void ModulatedDeformableCol2im(const Context& dev_ctx, T* grad_im) { int channel_per_deformable_group = im_shape[0] / deformable_group; int num_kernels = col_shape[0] * col_shape[1] * col_shape[2] * col_shape[3]; - int blocks = NumBlocks(num_kernels); - int threads = kNumCUDAThreads; - ModulatedDeformableCol2imGpuKernel - <<>>(num_kernels, - data_col, - data_offset, - data_mask, - im_shape[0], - im_shape[1], - im_shape[2], - kernel_shape[2], - kernel_shape[3], - pad[0], - pad[1], - stride[0], - stride[1], - dilation[0], - dilation[1], - channel_per_deformable_group, - col_shape[1], - deformable_group, - col_shape[2], - col_shape[3], - grad_im); + ModulatedDeformableCol2imCPUKernel(num_kernels, + data_col, + data_offset, + data_mask, + im_shape[0], + im_shape[1], + im_shape[2], + kernel_shape[2], + kernel_shape[3], + pad[0], + pad[1], + stride[0], + stride[1], + dilation[0], + dilation[1], + channel_per_deformable_group, + col_shape[1], + deformable_group, + col_shape[2], + col_shape[3], + grad_im); } template -__global__ void ModulatedDeformableCol2imCoordGpuKernel( - const int nthreads, +void ModulatedDeformableCol2imCoordCPUKernel( + const int num_kernels, const T* data_col, const T* data_im, const T* data_offset, @@ -185,10 +171,8 @@ __global__ void ModulatedDeformableCol2imCoordGpuKernel( const int width_col, T* grad_offset, T* grad_mask) { - int index = blockIdx.x * blockDim.x + threadIdx.x; - int offset = blockDim.x * gridDim.x; using MT = typename phi::dtype::MPTypeTrait::Type; - for (size_t i = index; i < nthreads; i += offset) { + for (int i = 0; i < num_kernels; i++) { MT val = 0, mval = 0; const int w = i % width_col; const int h = (i / width_col) % height_col; @@ -234,7 +218,6 @@ __global__ void ModulatedDeformableCol2imCoordGpuKernel( const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out); - const MT offset_h = static_cast(data_offset_ptr[data_offset_h_ptr]); const MT offset_w = static_cast(data_offset_ptr[data_offset_w_ptr]); MT inv_h = h_in + i * dilation_h + offset_h; @@ -242,14 +225,13 @@ __global__ void ModulatedDeformableCol2imCoordGpuKernel( if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) { inv_h = inv_w = -2; } else { - mval += - static_cast(data_col_ptr[col_pos]) * - funcs::DmcnIm2colBilinear(data_im_ptr + cnt * height * width, - width, - height, - width, - inv_h, - inv_w); + mval += static_cast(data_col_ptr[col_pos]) * + funcs::DmcnIm2colBilinear(data_im_ptr + cnt * height * width, + width, + height, + width, + inv_h, + inv_w); } const MT weight = DmcnGetCoordinateWeight(inv_h, @@ -299,49 +281,32 @@ void ModulatedDeformableCol2imCoord(const Context& dev_ctx, int num_kernels = 2 * kernel_shape[2] * kernel_shape[3] * col_shape[1] * col_shape[2] * col_shape[3] * deformable_groups; int channel_per_deformable_group = col_shape[0] / deformable_groups; - int blocks = NumBlocks(num_kernels); - int threads = kNumCUDAThreads; - - ModulatedDeformableCol2imCoordGpuKernel - <<>>( - num_kernels, - data_col, - data_im, - data_offset, - data_mask, - im_shape[0], - im_shape[1], - im_shape[2], - kernel_shape[2], - kernel_shape[3], - paddings[0], - paddings[1], - strides[0], - strides[1], - dilations[0], - dilations[1], - channel_per_deformable_group, - col_shape[1], - 2 * kernel_shape[2] * kernel_shape[3] * deformable_groups, - deformable_groups, - col_shape[2], - col_shape[3], - grad_offset, - grad_mask); -} -template -__global__ void FilterGradAddupGpuKernel(const int nthreads, - const int n, - const int height, - const int width, - const T* dweight_3d, - T* filter_grad) { - int index = blockIdx.x * blockDim.x + threadIdx.x; - int offset = blockDim.x * gridDim.x; - for (size_t i = index; i < nthreads; i += offset) { - filter_grad[i] = filter_grad[i] + dweight_3d[i]; - } + ModulatedDeformableCol2imCoordCPUKernel( + num_kernels, + data_col, + data_im, + data_offset, + data_mask, + im_shape[0], + im_shape[1], + im_shape[2], + kernel_shape[2], + kernel_shape[3], + paddings[0], + paddings[1], + strides[0], + strides[1], + dilations[0], + dilations[1], + channel_per_deformable_group, + col_shape[1], + 2 * kernel_shape[2] * kernel_shape[3] * deformable_groups, + deformable_groups, + col_shape[2], + col_shape[3], + grad_offset, + grad_mask); } template @@ -352,17 +317,16 @@ void FilterGradAddup(const Context& dev_ctx, const int width, const T* dweight_3d, T* filter_grad) { - FilterGradAddupGpuKernel - <<>>( - nthreads, n, height, width, dweight_3d, filter_grad); + for (int i = 0; i < nthreads; i++) { + filter_grad[i] = filter_grad[i] + dweight_3d[i]; + } } } // namespace phi PD_REGISTER_KERNEL(deformable_conv_grad, - GPU, + CPU, ALL_LAYOUT, phi::DeformableConvGradKernel, float, - double, - phi::dtype::float16) {} + double) {} From f6c8424428108ab4aeed6124aeaf8ee794800385 Mon Sep 17 00:00:00 2001 From: ray96 Date: Fri, 16 Sep 2022 14:09:44 +0800 Subject: [PATCH 03/23] fix slow part --- paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu b/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu index 093da5fe026b2..0704234694d9c 100644 --- a/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu @@ -108,9 +108,9 @@ __global__ void ModulatedDeformableCol2imGpuKernel( height, width); - paddle::platform::CudaAtomicAdd( - grad_im + cur_bottom_grad_pos, - static_cast(weight * cur_top_grad)); +// 慢在这 ,不支持float16 + *(grad_im + cur_bottom_grad_pos) = + *(grad_im + cur_bottom_grad_pos) + static_cast(weight * cur_top_grad); } } } From 62990e7888a1c106f934392ea7404d0fa40cb667 Mon Sep 17 00:00:00 2001 From: ray96 Date: Fri, 16 Sep 2022 20:30:49 +0800 Subject: [PATCH 04/23] set datetype --- paddle/phi/kernels/gpu/deformable_conv_kernel.cu | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/paddle/phi/kernels/gpu/deformable_conv_kernel.cu b/paddle/phi/kernels/gpu/deformable_conv_kernel.cu index 021791ca93061..31348fbb2c28d 100644 --- a/paddle/phi/kernels/gpu/deformable_conv_kernel.cu +++ b/paddle/phi/kernels/gpu/deformable_conv_kernel.cu @@ -24,4 +24,9 @@ PD_REGISTER_KERNEL(deformable_conv, phi::DeformableConvKernel, float, double, - phi::dtype::float16) {} + phi::dtype::float16) { + if (kernel_key.dtype() == phi::DataType::FLOAT16) { + kernel->InputAt(3).SetDataType(phi::DataType::FLOAT32); + kernel->InputAt(4).SetDataType(phi::DataType::FLOAT32); + } +} From 2820e9b3f7ae103a4ddf18bcd5936aa3117d690c Mon Sep 17 00:00:00 2001 From: ray96 Date: Mon, 19 Sep 2022 11:10:44 +0800 Subject: [PATCH 05/23] fix im2col_step bug and codestyle --- .../cpu/deformable_conv_grad_kernel.cc | 19 +++--- .../gpu/deformable_conv_grad_kernel.cu | 5 +- .../impl/deformable_conv_kernel_impl.h | 68 +++++++++++++++---- .../unittests/test_deformable_conv_v1_op.py | 3 - 4 files changed, 69 insertions(+), 26 deletions(-) diff --git a/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc b/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc index f6ff5d63372a0..962576154a7bf 100644 --- a/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc @@ -43,7 +43,6 @@ inline void ModulatedDeformableCol2imCPUKernel( const int height_col, const int width_col, T* grad_im) { - using MT = typename phi::dtype::MPTypeTrait::Type; for (int thread = 0; thread < num_kernels; thread++) { const int j = (thread / width_col / height_col / batch_size) % kernel_w; @@ -99,7 +98,8 @@ inline void ModulatedDeformableCol2imCPUKernel( width); *(grad_im + cur_bottom_grad_pos) = - *(grad_im + cur_bottom_grad_pos) + static_cast(weight * cur_top_grad); + *(grad_im + cur_bottom_grad_pos) + + static_cast(weight * cur_top_grad); } } } @@ -225,13 +225,14 @@ void ModulatedDeformableCol2imCoordCPUKernel( if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) { inv_h = inv_w = -2; } else { - mval += static_cast(data_col_ptr[col_pos]) * - funcs::DmcnIm2colBilinear(data_im_ptr + cnt * height * width, - width, - height, - width, - inv_h, - inv_w); + mval += + static_cast(data_col_ptr[col_pos]) * + funcs::DmcnIm2colBilinear(data_im_ptr + cnt * height * width, + width, + height, + width, + inv_h, + inv_w); } const MT weight = DmcnGetCoordinateWeight(inv_h, diff --git a/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu b/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu index 0704234694d9c..a87da6102f44a 100644 --- a/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu @@ -108,9 +108,10 @@ __global__ void ModulatedDeformableCol2imGpuKernel( height, width); -// 慢在这 ,不支持float16 + // 慢在这 ,不支持float16 *(grad_im + cur_bottom_grad_pos) = - *(grad_im + cur_bottom_grad_pos) + static_cast(weight * cur_top_grad); + *(grad_im + cur_bottom_grad_pos) + + static_cast(weight * cur_top_grad); } } } diff --git a/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h b/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h index 6bda245f3bda9..450d7a48e2bfb 100644 --- a/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h +++ b/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h @@ -20,6 +20,7 @@ #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/deformable_conv_functor.h" +#include "paddle/phi/kernels/transpose_kernel.h" #include "paddle/utils/optional.h" namespace phi { @@ -39,6 +40,11 @@ void DeformableConvKernel(const Context& dev_ctx, DenseTensor* out) { const int batch_size = static_cast(x.dims()[0]); + int temp_step = std::min(64, batch_size); + if (batch_size % temp_step == 0) { + im2col_step = temp_step; + } + using MT = typename phi::dtype::MPTypeTrait::Type; std::vector filter_shape_vec(phi::vectorize(filter.dims())); std::vector output_shape_vec(phi::vectorize(out->dims())); @@ -58,8 +64,9 @@ void DeformableConvKernel(const Context& dev_ctx, DenseTensor col_buffer = Empty(dev_ctx, col_buffer_shape_vec); DenseTensor output_buffer = Empty(dev_ctx, output_buffer_shape_vec); - int64_t M = output_shape_vec[1] / groups; - int64_t N = im2col_step * output_shape_vec[2] * output_shape_vec[3]; + int64_t M = output_shape_vec[1] / groups; // 4 : C + int64_t N = im2col_step * output_shape_vec[2] * + output_shape_vec[3]; // 2*3*3 :im2Step * H * W int64_t K = x.dims()[1] * filter_shape_vec[2] * filter_shape_vec[3] / groups; DenseTensor weight_3d; @@ -69,11 +76,18 @@ void DeformableConvKernel(const Context& dev_ctx, col_buffer_3d.ShareDataWith(col_buffer) .Resize(phi::make_ddim({groups, K, N})); - DenseTensor output_4d; + DenseTensor output_4d; //计算用 这里需要分配个空内存 output_4d.ShareDataWith(output_buffer) - .Resize(phi::make_ddim({batch_size / im2col_step, groups, M, N})); - - DDim input_shape = phi::slice_ddim(x.dims(), 1, x.dims().size()); + .Resize( + phi::make_ddim({batch_size / im2col_step, + groups, + M, + N})); // 3 * 1 * 4 * (2*3*3) : mini_batch * group * + // C/group * (im2stap * H * W) 3 * 1 * 4 * + // (2*3*3) : mini_batch * C * (im2stap * H * W) + + DDim input_shape = + phi::slice_ddim(x.dims(), 1, x.dims().size()); //单张图片大小:C*H*W std::vector input_shape_vec = phi::vectorize(input_shape); int input_dim = x.numel() / x.dims()[0]; @@ -103,17 +117,25 @@ void DeformableConvKernel(const Context& dev_ctx, dilations, deformable_groups, col_buffer_ptr); - DenseTensor output_3d = output_4d.Slice(i, i + 1).Resize( - phi::slice_ddim(output_4d.dims(), 1, output_4d.dims().size())); + DenseTensor output_3d = output_4d.Slice(i, i + 1).Resize(phi::slice_ddim( + output_4d.dims(), + 1, + output_4d.dims().size())); // group * C/group * (im2step * H * W) + // get the product of pixel and weight for (int g = 0; g < groups; ++g) { DenseTensor weight_3d_slice = weight_3d.Slice(g, g + 1).Resize( - phi::slice_ddim(weight_3d.dims(), 1, weight_3d.dims().size())); + phi::slice_ddim(weight_3d.dims(), + 1, + weight_3d.dims().size())); //等于是把第0维去掉 DenseTensor col_buffer_3d_slice = col_buffer_3d.Slice(g, g + 1).Resize(phi::slice_ddim( col_buffer_3d.dims(), 1, col_buffer_3d.dims().size())); - DenseTensor output_3d_slice = output_3d.Slice(g, g + 1).Resize( - phi::slice_ddim(output_3d.dims(), 1, output_3d.dims().size())); + DenseTensor output_3d_slice = + output_3d.Slice(g, g + 1).Resize(phi::slice_ddim( + output_3d.dims(), + 1, + output_3d.dims().size())); // 4*32:C * ((im2col_step)*H*W)) blas.MatMul(weight_3d_slice, false, col_buffer_3d_slice, @@ -123,7 +145,29 @@ void DeformableConvKernel(const Context& dev_ctx, T(0.0)); } } - out->ShareDataWith(output_buffer).Resize(phi::make_ddim(output_shape_vec)); + + // 对于im2col_step大于1时的bug进行修复 + if (im2col_step > 1) { + std::vector axis(4); + axis[0] = 0; + axis[1] = 2; + axis[2] = 1; + axis[3] = 3; + + DenseTensor real_output_buffer = phi::Transpose( + dev_ctx, + output_4d.Resize( + phi::make_ddim({batch_size / im2col_step, + output_shape_vec[1], + im2col_step, + output_shape_vec[2] * output_shape_vec[3]})), + axis); + + out->ShareDataWith(real_output_buffer) + .Resize(phi::make_ddim(output_shape_vec)); + } else { + out->ShareDataWith(output_buffer).Resize(phi::make_ddim(output_shape_vec)); + } } } // namespace phi diff --git a/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py b/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py index 57afd9f45aa27..ea094d0406d83 100644 --- a/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py +++ b/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py @@ -22,7 +22,6 @@ from op_test import OpTest from paddle.fluid.framework import _test_eager_guard - paddle.enable_static() @@ -204,7 +203,6 @@ def init_type(self): self.dtype = np.float32 - class TestWithStride(TestModulatedDeformableConvOp): def init_test_case(self): @@ -302,7 +300,6 @@ def test_check_output(self): class TestModulatedDeformableConvV1InvalidInput(unittest.TestCase): def test_error(self): - def test_invalid_input(): input = [1, 3, 32, 32] offset = fluid.data(name='offset', From 9a28eff856abe1a94428bb23ac07aebd6cb35c80 Mon Sep 17 00:00:00 2001 From: ray96 Date: Mon, 19 Sep 2022 11:12:18 +0800 Subject: [PATCH 06/23] modify file --- paddle/phi/kernels/gpu/deformable_conv_kernel.cu | 4 ---- 1 file changed, 4 deletions(-) diff --git a/paddle/phi/kernels/gpu/deformable_conv_kernel.cu b/paddle/phi/kernels/gpu/deformable_conv_kernel.cu index 31348fbb2c28d..52972a951c0a8 100644 --- a/paddle/phi/kernels/gpu/deformable_conv_kernel.cu +++ b/paddle/phi/kernels/gpu/deformable_conv_kernel.cu @@ -25,8 +25,4 @@ PD_REGISTER_KERNEL(deformable_conv, float, double, phi::dtype::float16) { - if (kernel_key.dtype() == phi::DataType::FLOAT16) { - kernel->InputAt(3).SetDataType(phi::DataType::FLOAT32); - kernel->InputAt(4).SetDataType(phi::DataType::FLOAT32); - } } From c56849fddcae5bcde6c2262272e3648d75c8d601 Mon Sep 17 00:00:00 2001 From: ray96 Date: Mon, 19 Sep 2022 11:50:22 +0800 Subject: [PATCH 07/23] modify file --- paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu b/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu index a87da6102f44a..1d30cb73607a9 100644 --- a/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu @@ -109,9 +109,8 @@ __global__ void ModulatedDeformableCol2imGpuKernel( width); // 慢在这 ,不支持float16 - *(grad_im + cur_bottom_grad_pos) = - *(grad_im + cur_bottom_grad_pos) + - static_cast(weight * cur_top_grad); + paddle::platform::CudaAtomicAdd(grad_im + cur_bottom_grad_pos, + static_cast(weight * cur_top_grad)); } } } From 7683477b0277e8d3b01d3c261aab7548bd5ccf72 Mon Sep 17 00:00:00 2001 From: ray96 Date: Tue, 20 Sep 2022 15:53:26 +0800 Subject: [PATCH 08/23] use fastAtomicAdd --- paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu b/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu index 1d30cb73607a9..27aec188e38fa 100644 --- a/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu @@ -107,10 +107,11 @@ __global__ void ModulatedDeformableCol2imGpuKernel( cur_w + dx, height, width); - - // 慢在这 ,不支持float16 - paddle::platform::CudaAtomicAdd(grad_im + cur_bottom_grad_pos, - static_cast(weight * cur_top_grad)); + paddle::platform::fastAtomicAdd( + grad_im, + cur_bottom_grad_pos, + nthreads, + static_cast(weight * cur_top_grad)); } } } @@ -365,4 +366,4 @@ PD_REGISTER_KERNEL(deformable_conv_grad, phi::DeformableConvGradKernel, float, double, - phi::dtype::float16) {} + paddle::platform::float16) {} From ec5644ff58bd01dc2a51ded96f805a16b6496625 Mon Sep 17 00:00:00 2001 From: ray96 Date: Wed, 21 Sep 2022 12:17:23 +0800 Subject: [PATCH 09/23] acc for ModulatedDeformableCol2im --- .../gpu/deformable_conv_grad_kernel.cu | 34 ++++++------- .../impl/deformable_conv_grad_kernel_impl.h | 48 ++++++++----------- 2 files changed, 36 insertions(+), 46 deletions(-) diff --git a/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu b/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu index 27aec188e38fa..d0c820460b202 100644 --- a/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu @@ -29,7 +29,7 @@ static inline int NumBlocks(const int N) { kNumMaximumNumBlocks); } -template +template __global__ void ModulatedDeformableCol2imGpuKernel( const int nthreads, const T* data_col, @@ -51,10 +51,10 @@ __global__ void ModulatedDeformableCol2imGpuKernel( const int deformable_group, const int height_col, const int width_col, - T* grad_im) { + MT* grad_im) { int index = blockIdx.x * blockDim.x + threadIdx.x; int offset = blockDim.x * gridDim.x; - using MT = typename phi::dtype::MPTypeTrait::Type; + // using MT = typename phi::dtype::MPTypeTrait::Type; for (size_t thread = index; thread < nthreads; thread += offset) { const int j = (thread / width_col / height_col / batch_size) % kernel_w; const int i = @@ -107,18 +107,15 @@ __global__ void ModulatedDeformableCol2imGpuKernel( cur_w + dx, height, width); - paddle::platform::fastAtomicAdd( - grad_im, - cur_bottom_grad_pos, - nthreads, - static_cast(weight * cur_top_grad)); + paddle::platform::CudaAtomicAdd(grad_im + cur_bottom_grad_pos, + weight * cur_top_grad); } } } } } -template +template void ModulatedDeformableCol2im(const Context& dev_ctx, const T* data_col, const T* data_offset, @@ -130,13 +127,13 @@ void ModulatedDeformableCol2im(const Context& dev_ctx, const std::vector& stride, const std::vector& dilation, const int deformable_group, - T* grad_im) { + MT* grad_im) { int channel_per_deformable_group = im_shape[0] / deformable_group; int num_kernels = col_shape[0] * col_shape[1] * col_shape[2] * col_shape[3]; int blocks = NumBlocks(num_kernels); int threads = kNumCUDAThreads; - ModulatedDeformableCol2imGpuKernel + ModulatedDeformableCol2imGpuKernel <<>>(num_kernels, data_col, data_offset, @@ -243,14 +240,13 @@ __global__ void ModulatedDeformableCol2imCoordGpuKernel( if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) { inv_h = inv_w = -2; } else { - mval += - static_cast(data_col_ptr[col_pos]) * - funcs::DmcnIm2colBilinear(data_im_ptr + cnt * height * width, - width, - height, - width, - inv_h, - inv_w); + mval += static_cast(data_col_ptr[col_pos]) * + funcs::DmcnIm2colBilinear(data_im_ptr + cnt * height * width, + width, + height, + width, + inv_h, + inv_w); } const MT weight = DmcnGetCoordinateWeight(inv_h, diff --git a/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h b/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h index 90a5243fc72d4..ccbe3aa21b4e7 100644 --- a/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h @@ -14,13 +14,13 @@ #pragma once -#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/hostdevice.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/full_kernel.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/deformable_conv_functor.h" +#include "paddle/phi/common/amp_type_traits.h" namespace phi { @@ -82,46 +82,38 @@ HOSTDEVICE MT DmcnGetCoordinateWeight(MT argmax_h, if (bp_dir == 0) { weight += (argmax_h_low >= 0 && argmax_w_low >= 0) ? -1 * (argmax_w_low + 1 - argmax_w) * - static_cast( - im_data[argmax_h_low * data_width + argmax_w_low]) + static_cast(im_data[argmax_h_low * data_width + argmax_w_low]) : 0; weight += (argmax_h_low >= 0 && argmax_w_high <= width - 1) ? -1 * (argmax_w - argmax_w_low) * - static_cast( - im_data[argmax_h_low * data_width + argmax_w_high]) + static_cast(im_data[argmax_h_low * data_width + argmax_w_high]) : 0; weight += (argmax_h_high <= height - 1 && argmax_w_low >= 0) ? (argmax_w_low + 1 - argmax_w) * - static_cast( - im_data[argmax_h_high * data_width + argmax_w_low]) + static_cast(im_data[argmax_h_high * data_width + argmax_w_low]) : 0; weight += (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) ? (argmax_w - argmax_w_low) * - static_cast( - im_data[argmax_h_high * data_width + argmax_w_high]) + static_cast(im_data[argmax_h_high * data_width + argmax_w_high]) : 0; } else if (bp_dir == 1) { weight += (argmax_h_low >= 0 && argmax_w_low >= 0) ? -1 * (argmax_h_low + 1 - argmax_h) * - static_cast( - im_data[argmax_h_low * data_width + argmax_w_low]) + static_cast(im_data[argmax_h_low * data_width + argmax_w_low]) : 0; weight += (argmax_h_low >= 0 && argmax_w_high <= width - 1) ? (argmax_h_low + 1 - argmax_h) * - static_cast( - im_data[argmax_h_low * data_width + argmax_w_high]) + static_cast(im_data[argmax_h_low * data_width + argmax_w_high]) : 0; weight += (argmax_h_high <= height - 1 && argmax_w_low >= 0) ? -1 * (argmax_h - argmax_h_low) * - static_cast( - im_data[argmax_h_high * data_width + argmax_w_low]) + static_cast(im_data[argmax_h_high * data_width + argmax_w_low]) : 0; weight += (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) ? (argmax_h - argmax_h_low) * - static_cast( - im_data[argmax_h_high * data_width + argmax_w_high]) + static_cast(im_data[argmax_h_high * data_width + argmax_w_high]) : 0; } @@ -144,7 +136,7 @@ void ModulatedDeformableCol2imCoord(const Context& dev_ctx, T* grad_offset, T* grad_mask); -template +template void ModulatedDeformableCol2im(const Context& dev_ctx, const T* data_col, const T* data_offset, @@ -156,7 +148,7 @@ void ModulatedDeformableCol2im(const Context& dev_ctx, const std::vector& stride, const std::vector& dilation, const int deformable_group, - T* grad_im); + MT* grad_im); template void FilterGradAddup(const Context& dev_ctx, @@ -167,6 +159,7 @@ void FilterGradAddup(const Context& dev_ctx, const T* dweight_3d, T* filter_grad); +// 总入口 template void DeformableConvGradKernel(const Context& dev_ctx, const DenseTensor& x, @@ -185,7 +178,7 @@ void DeformableConvGradKernel(const Context& dev_ctx, DenseTensor* filter_grad, DenseTensor* mask_grad) { const int batch_size = static_cast(x.dims()[0]); - + using MT = typename phi::dtype::MPTypeTrait::Type; DDim input_shape = phi::slice_ddim(x.dims(), 1, x.dims().size()); std::vector input_shape_vec = phi::vectorize(input_shape); std::vector filter_shape_vec(phi::vectorize(filter.dims())); @@ -270,7 +263,7 @@ void DeformableConvGradKernel(const Context& dev_ctx, T(1.0), &col_buffer_3d_slice, T(0.0)); - } + } // 相信没啥问题 col_buffer.Resize(make_ddim(col_buffer_shape_vec)); T* col_buffer_ptr = col_buffer.data(); @@ -299,10 +292,11 @@ void DeformableConvGradKernel(const Context& dev_ctx, deformable_groups, offset_grad_ptr + i * im2col_step * input_offset_dim, mask_grad_data_ptr); - } // check + } //check if (dx) { - T* dx_ptr = dx->data(); - // get grad of input + + MT* mt_dx_ptr = dev_ctx.template Alloc(dx); + ModulatedDeformableCol2im(dev_ctx, col_buffer_ptr, offset_ptr + i * im2col_step * input_offset_dim, @@ -314,7 +308,7 @@ void DeformableConvGradKernel(const Context& dev_ctx, strides, dilations, deformable_groups, - dx_ptr + i * im2col_step * input_dim); //待改 + mt_dx_ptr + i * im2col_step * input_dim); dx->Resize(x.dims()); } @@ -330,7 +324,7 @@ void DeformableConvGradKernel(const Context& dev_ctx, strides, dilations, deformable_groups, - col_buffer_ptr); // check + col_buffer_ptr); //check col_buffer_3d.Resize(col_buffer_3d_shape); @@ -362,7 +356,7 @@ void DeformableConvGradKernel(const Context& dev_ctx, K, M, dweight_3d.data(), - filter_grad->data()); // 待改 + filter_grad->data()); } } if (filter_grad) { From 4538bb5d927ff93e6790de3b302bccea50b15376 Mon Sep 17 00:00:00 2001 From: ray96 Date: Thu, 22 Sep 2022 20:43:42 +0800 Subject: [PATCH 10/23] modify codestyle --- .../cpu/deformable_conv_grad_kernel.cc | 4 +-- .../gpu/deformable_conv_grad_kernel.cu | 15 +++++---- .../impl/deformable_conv_grad_kernel_impl.h | 33 +++++++++++-------- .../unittests/test_deformable_conv_v1_op.py | 1 + 4 files changed, 31 insertions(+), 22 deletions(-) diff --git a/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc b/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc index 962576154a7bf..537f0de367693 100644 --- a/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc @@ -106,7 +106,7 @@ inline void ModulatedDeformableCol2imCPUKernel( } } -template +template void ModulatedDeformableCol2im(const Context& dev_ctx, const T* data_col, const T* data_offset, @@ -118,7 +118,7 @@ void ModulatedDeformableCol2im(const Context& dev_ctx, const std::vector& stride, const std::vector& dilation, const int deformable_group, - T* grad_im) { + MT* grad_im) { int channel_per_deformable_group = im_shape[0] / deformable_group; int num_kernels = col_shape[0] * col_shape[1] * col_shape[2] * col_shape[3]; diff --git a/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu b/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu index d0c820460b202..5d2f4727c53e2 100644 --- a/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu @@ -240,13 +240,14 @@ __global__ void ModulatedDeformableCol2imCoordGpuKernel( if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) { inv_h = inv_w = -2; } else { - mval += static_cast(data_col_ptr[col_pos]) * - funcs::DmcnIm2colBilinear(data_im_ptr + cnt * height * width, - width, - height, - width, - inv_h, - inv_w); + mval += + static_cast(data_col_ptr[col_pos]) * + funcs::DmcnIm2colBilinear(data_im_ptr + cnt * height * width, + width, + height, + width, + inv_h, + inv_w); } const MT weight = DmcnGetCoordinateWeight(inv_h, diff --git a/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h b/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h index ccbe3aa21b4e7..af620de132cfa 100644 --- a/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h @@ -14,13 +14,13 @@ #pragma once +#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/hostdevice.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/full_kernel.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/deformable_conv_functor.h" -#include "paddle/phi/common/amp_type_traits.h" namespace phi { @@ -82,38 +82,46 @@ HOSTDEVICE MT DmcnGetCoordinateWeight(MT argmax_h, if (bp_dir == 0) { weight += (argmax_h_low >= 0 && argmax_w_low >= 0) ? -1 * (argmax_w_low + 1 - argmax_w) * - static_cast(im_data[argmax_h_low * data_width + argmax_w_low]) + static_cast( + im_data[argmax_h_low * data_width + argmax_w_low]) : 0; weight += (argmax_h_low >= 0 && argmax_w_high <= width - 1) ? -1 * (argmax_w - argmax_w_low) * - static_cast(im_data[argmax_h_low * data_width + argmax_w_high]) + static_cast( + im_data[argmax_h_low * data_width + argmax_w_high]) : 0; weight += (argmax_h_high <= height - 1 && argmax_w_low >= 0) ? (argmax_w_low + 1 - argmax_w) * - static_cast(im_data[argmax_h_high * data_width + argmax_w_low]) + static_cast( + im_data[argmax_h_high * data_width + argmax_w_low]) : 0; weight += (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) ? (argmax_w - argmax_w_low) * - static_cast(im_data[argmax_h_high * data_width + argmax_w_high]) + static_cast( + im_data[argmax_h_high * data_width + argmax_w_high]) : 0; } else if (bp_dir == 1) { weight += (argmax_h_low >= 0 && argmax_w_low >= 0) ? -1 * (argmax_h_low + 1 - argmax_h) * - static_cast(im_data[argmax_h_low * data_width + argmax_w_low]) + static_cast( + im_data[argmax_h_low * data_width + argmax_w_low]) : 0; weight += (argmax_h_low >= 0 && argmax_w_high <= width - 1) ? (argmax_h_low + 1 - argmax_h) * - static_cast(im_data[argmax_h_low * data_width + argmax_w_high]) + static_cast( + im_data[argmax_h_low * data_width + argmax_w_high]) : 0; weight += (argmax_h_high <= height - 1 && argmax_w_low >= 0) ? -1 * (argmax_h - argmax_h_low) * - static_cast(im_data[argmax_h_high * data_width + argmax_w_low]) + static_cast( + im_data[argmax_h_high * data_width + argmax_w_low]) : 0; weight += (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) ? (argmax_h - argmax_h_low) * - static_cast(im_data[argmax_h_high * data_width + argmax_w_high]) + static_cast( + im_data[argmax_h_high * data_width + argmax_w_high]) : 0; } @@ -263,7 +271,7 @@ void DeformableConvGradKernel(const Context& dev_ctx, T(1.0), &col_buffer_3d_slice, T(0.0)); - } // 相信没啥问题 + } // 相信没啥问题 col_buffer.Resize(make_ddim(col_buffer_shape_vec)); T* col_buffer_ptr = col_buffer.data(); @@ -292,9 +300,8 @@ void DeformableConvGradKernel(const Context& dev_ctx, deformable_groups, offset_grad_ptr + i * im2col_step * input_offset_dim, mask_grad_data_ptr); - } //check + } // check if (dx) { - MT* mt_dx_ptr = dev_ctx.template Alloc(dx); ModulatedDeformableCol2im(dev_ctx, @@ -324,7 +331,7 @@ void DeformableConvGradKernel(const Context& dev_ctx, strides, dilations, deformable_groups, - col_buffer_ptr); //check + col_buffer_ptr); // check col_buffer_3d.Resize(col_buffer_3d_shape); diff --git a/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py b/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py index ea094d0406d83..6e70a8487e6a6 100644 --- a/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py +++ b/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py @@ -300,6 +300,7 @@ def test_check_output(self): class TestModulatedDeformableConvV1InvalidInput(unittest.TestCase): def test_error(self): + def test_invalid_input(): input = [1, 3, 32, 32] offset = fluid.data(name='offset', From e0c75aff4beda947563dbbf45d20db77acb026de Mon Sep 17 00:00:00 2001 From: ray96 Date: Mon, 26 Sep 2022 16:36:37 +0800 Subject: [PATCH 11/23] remove useless notes --- .../kernels/funcs/deformable_conv_functor.cu | 2 -- .../impl/deformable_conv_grad_kernel_impl.h | 7 ++--- .../impl/deformable_conv_kernel_impl.h | 28 ++++++------------- 3 files changed, 12 insertions(+), 25 deletions(-) diff --git a/paddle/phi/kernels/funcs/deformable_conv_functor.cu b/paddle/phi/kernels/funcs/deformable_conv_functor.cu index 7ae2526406f2b..8a23c0d26d424 100644 --- a/paddle/phi/kernels/funcs/deformable_conv_functor.cu +++ b/paddle/phi/kernels/funcs/deformable_conv_functor.cu @@ -82,8 +82,6 @@ __global__ void ModulatedDeformableIm2colGpuKernel( kernel_h * kernel_w * height_col * width_col : nullptr; - // 上边都动不了,指针移动也不会引起误差和速度降低 - for (int i = 0; i < kernel_h; ++i) { for (int j = 0; j < kernel_w; ++j) { const int data_offset_h_ptr = diff --git a/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h b/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h index af620de132cfa..4828a02cfefa0 100644 --- a/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h @@ -167,7 +167,6 @@ void FilterGradAddup(const Context& dev_ctx, const T* dweight_3d, T* filter_grad); -// 总入口 template void DeformableConvGradKernel(const Context& dev_ctx, const DenseTensor& x, @@ -271,7 +270,7 @@ void DeformableConvGradKernel(const Context& dev_ctx, T(1.0), &col_buffer_3d_slice, T(0.0)); - } // 相信没啥问题 + } col_buffer.Resize(make_ddim(col_buffer_shape_vec)); T* col_buffer_ptr = col_buffer.data(); @@ -300,7 +299,7 @@ void DeformableConvGradKernel(const Context& dev_ctx, deformable_groups, offset_grad_ptr + i * im2col_step * input_offset_dim, mask_grad_data_ptr); - } // check + } if (dx) { MT* mt_dx_ptr = dev_ctx.template Alloc(dx); @@ -331,7 +330,7 @@ void DeformableConvGradKernel(const Context& dev_ctx, strides, dilations, deformable_groups, - col_buffer_ptr); // check + col_buffer_ptr); col_buffer_3d.Resize(col_buffer_3d_shape); diff --git a/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h b/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h index 450d7a48e2bfb..d66f4e58e5b61 100644 --- a/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h +++ b/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h @@ -64,9 +64,8 @@ void DeformableConvKernel(const Context& dev_ctx, DenseTensor col_buffer = Empty(dev_ctx, col_buffer_shape_vec); DenseTensor output_buffer = Empty(dev_ctx, output_buffer_shape_vec); - int64_t M = output_shape_vec[1] / groups; // 4 : C - int64_t N = im2col_step * output_shape_vec[2] * - output_shape_vec[3]; // 2*3*3 :im2Step * H * W + int64_t M = output_shape_vec[1] / groups; + int64_t N = im2col_step * output_shape_vec[2] * output_shape_vec[3]; int64_t K = x.dims()[1] * filter_shape_vec[2] * filter_shape_vec[3] / groups; DenseTensor weight_3d; @@ -76,18 +75,11 @@ void DeformableConvKernel(const Context& dev_ctx, col_buffer_3d.ShareDataWith(col_buffer) .Resize(phi::make_ddim({groups, K, N})); - DenseTensor output_4d; //计算用 这里需要分配个空内存 + DenseTensor output_4d; output_4d.ShareDataWith(output_buffer) - .Resize( - phi::make_ddim({batch_size / im2col_step, - groups, - M, - N})); // 3 * 1 * 4 * (2*3*3) : mini_batch * group * - // C/group * (im2stap * H * W) 3 * 1 * 4 * - // (2*3*3) : mini_batch * C * (im2stap * H * W) - - DDim input_shape = - phi::slice_ddim(x.dims(), 1, x.dims().size()); //单张图片大小:C*H*W + .Resize(phi::make_ddim({batch_size / im2col_step, groups, M, N})); + + DDim input_shape = phi::slice_ddim(x.dims(), 1, x.dims().size()); std::vector input_shape_vec = phi::vectorize(input_shape); int input_dim = x.numel() / x.dims()[0]; @@ -125,9 +117,7 @@ void DeformableConvKernel(const Context& dev_ctx, // get the product of pixel and weight for (int g = 0; g < groups; ++g) { DenseTensor weight_3d_slice = weight_3d.Slice(g, g + 1).Resize( - phi::slice_ddim(weight_3d.dims(), - 1, - weight_3d.dims().size())); //等于是把第0维去掉 + phi::slice_ddim(weight_3d.dims(), 1, weight_3d.dims().size())); DenseTensor col_buffer_3d_slice = col_buffer_3d.Slice(g, g + 1).Resize(phi::slice_ddim( col_buffer_3d.dims(), 1, col_buffer_3d.dims().size())); @@ -135,7 +125,7 @@ void DeformableConvKernel(const Context& dev_ctx, output_3d.Slice(g, g + 1).Resize(phi::slice_ddim( output_3d.dims(), 1, - output_3d.dims().size())); // 4*32:C * ((im2col_step)*H*W)) + output_3d.dims().size())); // C * ((im2col_step)*H*W)) blas.MatMul(weight_3d_slice, false, col_buffer_3d_slice, @@ -146,7 +136,7 @@ void DeformableConvKernel(const Context& dev_ctx, } } - // 对于im2col_step大于1时的bug进行修复 + // swap axis to get the right result when im2col_step is greater than 1 if (im2col_step > 1) { std::vector axis(4); axis[0] = 0; From ab8aeec150e440a45f9bbbe105a46a3c53017fc4 Mon Sep 17 00:00:00 2001 From: Rayman Date: Thu, 29 Sep 2022 11:40:28 +0800 Subject: [PATCH 12/23] Update deformable_conv_kernel.cu --- paddle/phi/kernels/gpu/deformable_conv_kernel.cu | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/paddle/phi/kernels/gpu/deformable_conv_kernel.cu b/paddle/phi/kernels/gpu/deformable_conv_kernel.cu index 52972a951c0a8..021791ca93061 100644 --- a/paddle/phi/kernels/gpu/deformable_conv_kernel.cu +++ b/paddle/phi/kernels/gpu/deformable_conv_kernel.cu @@ -24,5 +24,4 @@ PD_REGISTER_KERNEL(deformable_conv, phi::DeformableConvKernel, float, double, - phi::dtype::float16) { -} + phi::dtype::float16) {} From b697128fd55b573fb992b887736912497f287f7e Mon Sep 17 00:00:00 2001 From: ray96 Date: Sat, 8 Oct 2022 09:52:14 +0800 Subject: [PATCH 13/23] modify unit_test --- .../fluid/tests/unittests/test_deformable_conv_v1_op.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py b/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py index 58147d9e038b5..6ce1aaaa5d79c 100644 --- a/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py +++ b/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py @@ -294,11 +294,17 @@ def init_type(self): def test_check_output(self): self.check_output(check_eager=True, atol=1e-3) + def test_check_grad(self): + self.check_grad_with_place(core.CUDAPlace(0), ['Input', 'Offset', 'Filter'], + 'Output', + numeric_grad_delta=1e-3, + max_relative_error=1e-3, + check_eager=True) + class TestModulatedDeformableConvV1InvalidInput(unittest.TestCase): def test_error(self): - def test_invalid_input(): input = [1, 3, 32, 32] offset = fluid.data(name='offset', From 242d9a6b9bddea874a512c54a4936c397ae1983e Mon Sep 17 00:00:00 2001 From: ray96 Date: Sat, 8 Oct 2022 21:46:26 +0800 Subject: [PATCH 14/23] modify test --- .../fluid/tests/unittests/test_deformable_conv_v1_op.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py b/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py index 6ce1aaaa5d79c..d729c4bdf3d9c 100644 --- a/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py +++ b/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py @@ -286,6 +286,8 @@ def init_type(self): self.dtype = np.float64 +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") class TestWithFloat16(TestModulatedDeformableConvOp): def init_type(self): @@ -294,13 +296,6 @@ def init_type(self): def test_check_output(self): self.check_output(check_eager=True, atol=1e-3) - def test_check_grad(self): - self.check_grad_with_place(core.CUDAPlace(0), ['Input', 'Offset', 'Filter'], - 'Output', - numeric_grad_delta=1e-3, - max_relative_error=1e-3, - check_eager=True) - class TestModulatedDeformableConvV1InvalidInput(unittest.TestCase): From daf5b2a2e921a5418b83fd54b8c0492fdf68b0f2 Mon Sep 17 00:00:00 2001 From: ray96 Date: Sun, 9 Oct 2022 09:06:34 +0800 Subject: [PATCH 15/23] modify test --- .../unittests/test_deformable_conv_v1_op.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py b/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py index d729c4bdf3d9c..54705e108ed41 100644 --- a/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py +++ b/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py @@ -296,10 +296,26 @@ def init_type(self): def test_check_output(self): self.check_output(check_eager=True, atol=1e-3) + def test_check_grad(self): + self.check_grad(['Input', 'Filter'], + 'Output', + numeric_grad_delta=3e-1, + max_relative_error=4e-1, + check_eager=True) + + def test_check_grad_no_filter(self): + self.check_grad(['Input'], + 'Output', + numeric_grad_delta=3e-1, + max_relative_error=2e-1, + no_grad_set=set(['Filter']), + check_eager=True) + class TestModulatedDeformableConvV1InvalidInput(unittest.TestCase): def test_error(self): + def test_invalid_input(): input = [1, 3, 32, 32] offset = fluid.data(name='offset', From f1b463afa9ec16a911b4956202866d38293fb07f Mon Sep 17 00:00:00 2001 From: ray96 Date: Sun, 9 Oct 2022 11:53:35 +0800 Subject: [PATCH 16/23] modify when type16 --- .../kernels/funcs/deformable_conv_functor.cc | 1 - .../kernels/funcs/deformable_conv_functor.cu | 2 +- .../impl/deformable_conv_grad_kernel_impl.h | 51 ++++++++++++++----- 3 files changed, 38 insertions(+), 16 deletions(-) diff --git a/paddle/phi/kernels/funcs/deformable_conv_functor.cc b/paddle/phi/kernels/funcs/deformable_conv_functor.cc index f6483b9cdd2e8..d5b99760f9a45 100644 --- a/paddle/phi/kernels/funcs/deformable_conv_functor.cc +++ b/paddle/phi/kernels/funcs/deformable_conv_functor.cc @@ -13,7 +13,6 @@ // limitations under the License. #include "paddle/phi/kernels/funcs/deformable_conv_functor.h" -#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/float16.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/amp_type_traits.h" diff --git a/paddle/phi/kernels/funcs/deformable_conv_functor.cu b/paddle/phi/kernels/funcs/deformable_conv_functor.cu index 8a23c0d26d424..813d549136084 100644 --- a/paddle/phi/kernels/funcs/deformable_conv_functor.cu +++ b/paddle/phi/kernels/funcs/deformable_conv_functor.cu @@ -13,7 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/funcs/deformable_conv_functor.h" -#include "paddle/fluid/platform/device_context.h" +#include "paddle/phi/core/device_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/common/float16.h" diff --git a/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h b/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h index 4828a02cfefa0..ab6d226b03c56 100644 --- a/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h @@ -21,6 +21,7 @@ #include "paddle/phi/kernels/full_kernel.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/deformable_conv_functor.h" +#include "paddle/phi/kernels/cast_kernel.h" namespace phi { @@ -301,21 +302,43 @@ void DeformableConvGradKernel(const Context& dev_ctx, mask_grad_data_ptr); } if (dx) { - MT* mt_dx_ptr = dev_ctx.template Alloc(dx); - - ModulatedDeformableCol2im(dev_ctx, - col_buffer_ptr, - offset_ptr + i * im2col_step * input_offset_dim, - mask_data_ptr, - input_shape_vec, - col_buffer_shape_vec, - filter_shape_vec, - paddings, - strides, - dilations, - deformable_groups, - mt_dx_ptr + i * im2col_step * input_dim); + if (x.dtype() == DataType::FLOAT16){ + DenseTensor mt_dx = phi::EmptyLike(dev_ctx, *dx); + MT* mt_dx_ptr = dev_ctx.template Alloc(&mt_dx); + + ModulatedDeformableCol2im(dev_ctx, + col_buffer_ptr, + offset_ptr + i * im2col_step * input_offset_dim, + mask_data_ptr, + input_shape_vec, + col_buffer_shape_vec, + filter_shape_vec, + paddings, + strides, + dilations, + deformable_groups, + mt_dx_ptr + i * im2col_step * input_dim); + + DenseTensor t_dx = phi::Cast(dev_ctx, mt_dx, x.dtype()); + dx->ShareDataWith(t_dx); + } else { + MT* mt_dx_ptr = dev_ctx.template Alloc(dx); + ModulatedDeformableCol2im(dev_ctx, + col_buffer_ptr, + offset_ptr + i * im2col_step * input_offset_dim, + mask_data_ptr, + input_shape_vec, + col_buffer_shape_vec, + filter_shape_vec, + paddings, + strides, + dilations, + deformable_groups, + mt_dx_ptr + i * im2col_step * input_dim); + + } dx->Resize(x.dims()); + } funcs::ModulatedDeformableIm2col( From 916abf16d57fe74a812ec8618f6b8ae0d7c178f2 Mon Sep 17 00:00:00 2001 From: ray96 Date: Sun, 9 Oct 2022 13:19:01 +0800 Subject: [PATCH 17/23] skip grad check for fp16 --- .../cpu/deformable_conv_grad_kernel.cc | 76 +++++++++---------- .../kernels/funcs/deformable_conv_functor.cc | 16 ++-- .../unittests/test_deformable_conv_v1_op.py | 18 ++--- 3 files changed, 48 insertions(+), 62 deletions(-) diff --git a/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc b/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc index 537f0de367693..d38310360d0ae 100644 --- a/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc @@ -43,7 +43,6 @@ inline void ModulatedDeformableCol2imCPUKernel( const int height_col, const int width_col, T* grad_im) { - using MT = typename phi::dtype::MPTypeTrait::Type; for (int thread = 0; thread < num_kernels; thread++) { const int j = (thread / width_col / height_col / batch_size) % kernel_w; const int i = @@ -68,17 +67,17 @@ inline void ModulatedDeformableCol2imCPUKernel( ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_out) * width_col + w_out; - const MT offset_h = static_cast(data_offset_ptr[data_offset_h_ptr]); - const MT offset_w = static_cast(data_offset_ptr[data_offset_w_ptr]); - const MT cur_inv_h_data = h_in + i * dilation_h + offset_h; - const MT cur_inv_w_data = w_in + j * dilation_w + offset_w; + const T offset_h = data_offset_ptr[data_offset_h_ptr]; + const T offset_w = data_offset_ptr[data_offset_w_ptr]; + const T cur_inv_h_data = h_in + i * dilation_h + offset_h; + const T cur_inv_w_data = w_in + j * dilation_w + offset_w; - MT cur_top_grad = static_cast(data_col[thread]); + T cur_top_grad = data_col[thread]; if (data_mask) { const T* data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; - const MT mask = static_cast(data_mask_ptr[data_mask_hw_ptr]); + const T mask = data_mask_ptr[data_mask_hw_ptr]; cur_top_grad *= mask; } const int cur_h = static_cast(cur_inv_h_data); @@ -90,16 +89,15 @@ inline void ModulatedDeformableCol2imCPUKernel( abs(cur_inv_w_data - (cur_w + dx)) < 1) { int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; - MT weight = DmcnGetGradientWeight(cur_inv_h_data, - cur_inv_w_data, - cur_h + dy, - cur_w + dx, - height, - width); + T weight = DmcnGetGradientWeight(cur_inv_h_data, + cur_inv_w_data, + cur_h + dy, + cur_w + dx, + height, + width); *(grad_im + cur_bottom_grad_pos) = - *(grad_im + cur_bottom_grad_pos) + - static_cast(weight * cur_top_grad); + *(grad_im + cur_bottom_grad_pos) + (weight * cur_top_grad); } } } @@ -171,9 +169,8 @@ void ModulatedDeformableCol2imCoordCPUKernel( const int width_col, T* grad_offset, T* grad_mask) { - using MT = typename phi::dtype::MPTypeTrait::Type; for (int i = 0; i < num_kernels; i++) { - MT val = 0, mval = 0; + T val = 0, mval = 0; const int w = i % width_col; const int h = (i / width_col) % height_col; const int c = (i / width_col / height_col) % offset_channels; @@ -218,37 +215,36 @@ void ModulatedDeformableCol2imCoordCPUKernel( const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out); - const MT offset_h = static_cast(data_offset_ptr[data_offset_h_ptr]); - const MT offset_w = static_cast(data_offset_ptr[data_offset_w_ptr]); - MT inv_h = h_in + i * dilation_h + offset_h; - MT inv_w = w_in + j * dilation_w + offset_w; + const T offset_h = data_offset_ptr[data_offset_h_ptr]; + const T offset_w = data_offset_ptr[data_offset_w_ptr]; + T inv_h = h_in + i * dilation_h + offset_h; + T inv_w = w_in + j * dilation_w + offset_w; if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) { inv_h = inv_w = -2; } else { - mval += - static_cast(data_col_ptr[col_pos]) * - funcs::DmcnIm2colBilinear(data_im_ptr + cnt * height * width, - width, - height, - width, - inv_h, - inv_w); + mval += data_col_ptr[col_pos] * funcs::DmcnIm2colBilinear( + data_im_ptr + cnt * height * width, + width, + height, + width, + inv_h, + inv_w); } - const MT weight = - DmcnGetCoordinateWeight(inv_h, - inv_w, - height, - width, - data_im_ptr + cnt * height * width, - width, - bp_dir); + const T weight = + DmcnGetCoordinateWeight(inv_h, + inv_w, + height, + width, + data_im_ptr + cnt * height * width, + width, + bp_dir); if (data_mask_ptr) { const int data_mask_hw_ptr = (((i * kernel_w + j) * height_col + h_out) * width_col + w_out); - const MT mask = static_cast(data_mask_ptr[data_mask_hw_ptr]); - val += weight * static_cast(data_col_ptr[col_pos]) * mask; + const T mask = data_mask_ptr[data_mask_hw_ptr]; + val += weight * data_col_ptr[col_pos] * mask; } else { - val += weight * static_cast(data_col_ptr[col_pos]); + val += weight * data_col_ptr[col_pos]; } cnt += 1; } diff --git a/paddle/phi/kernels/funcs/deformable_conv_functor.cc b/paddle/phi/kernels/funcs/deformable_conv_functor.cc index d5b99760f9a45..226fa51bc335b 100644 --- a/paddle/phi/kernels/funcs/deformable_conv_functor.cc +++ b/paddle/phi/kernels/funcs/deformable_conv_functor.cc @@ -13,7 +13,6 @@ // limitations under the License. #include "paddle/phi/kernels/funcs/deformable_conv_functor.h" -#include "paddle/fluid/platform/float16.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/amp_type_traits.h" @@ -43,7 +42,6 @@ inline void ModulatedDeformableIm2colCPUKernel( const int height_col, const int width_col, T* data_col) { - using MT = typename phi::dtype::MPTypeTrait::Type; for (int i = 0; i < num_kernels; i++) { const int w_col = i % width_col; const int h_col = (i / width_col) % height_col; @@ -78,19 +76,19 @@ inline void ModulatedDeformableIm2colCPUKernel( ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col; - const MT offset_h = static_cast(data_offset_ptr[data_offset_h_ptr]); - const MT offset_w = static_cast(data_offset_ptr[data_offset_w_ptr]); - MT val = static_cast(0); - const MT h_im = h_in + i * dilation_h + offset_h; - const MT w_im = w_in + j * dilation_w + offset_w; + const T offset_h = data_offset_ptr[data_offset_h_ptr]; + const T offset_w = data_offset_ptr[data_offset_w_ptr]; + T val = 0; + const T h_im = h_in + i * dilation_h + offset_h; + const T w_im = w_in + j * dilation_w + offset_w; if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) { - val = DmcnIm2colBilinear( + val = DmcnIm2colBilinear( data_im_ptr, width, height, width, h_im, w_im); } if (data_mask_ptr) { const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_col) * width_col + w_col; - const MT mask = static_cast(data_mask_ptr[data_mask_hw_ptr]); + const T mask = data_mask_ptr[data_mask_hw_ptr]; val *= mask; } *data_col_ptr = static_cast(val); diff --git a/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py b/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py index 54705e108ed41..c800881ee98a9 100644 --- a/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py +++ b/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py @@ -17,7 +17,7 @@ import numpy as np import paddle.fluid as fluid import paddle.fluid.core as core -from op_test import OpTest +from op_test import OpTest, skip_check_grad_ci from paddle.fluid.framework import _test_eager_guard paddle.enable_static() @@ -288,6 +288,8 @@ def init_type(self): @unittest.skipIf(not core.is_compiled_with_cuda(), "core is not compiled with CUDA") +@skip_check_grad_ci( + reason="Grad check for deform-fp16 need to be compared with fp32, will be included in another PR") class TestWithFloat16(TestModulatedDeformableConvOp): def init_type(self): @@ -297,25 +299,15 @@ def test_check_output(self): self.check_output(check_eager=True, atol=1e-3) def test_check_grad(self): - self.check_grad(['Input', 'Filter'], - 'Output', - numeric_grad_delta=3e-1, - max_relative_error=4e-1, - check_eager=True) + pass def test_check_grad_no_filter(self): - self.check_grad(['Input'], - 'Output', - numeric_grad_delta=3e-1, - max_relative_error=2e-1, - no_grad_set=set(['Filter']), - check_eager=True) + pass class TestModulatedDeformableConvV1InvalidInput(unittest.TestCase): def test_error(self): - def test_invalid_input(): input = [1, 3, 32, 32] offset = fluid.data(name='offset', From f1fec53289d0f2640aad79ee5460767ad132758d Mon Sep 17 00:00:00 2001 From: ray96 Date: Sun, 9 Oct 2022 13:28:37 +0800 Subject: [PATCH 18/23] code style --- .../kernels/funcs/deformable_conv_functor.cu | 2 +- .../impl/deformable_conv_grad_kernel_impl.h | 56 +++++++++---------- .../unittests/test_deformable_conv_v1_op.py | 1 + 3 files changed, 30 insertions(+), 29 deletions(-) diff --git a/paddle/phi/kernels/funcs/deformable_conv_functor.cu b/paddle/phi/kernels/funcs/deformable_conv_functor.cu index 813d549136084..0d5076a4937c3 100644 --- a/paddle/phi/kernels/funcs/deformable_conv_functor.cu +++ b/paddle/phi/kernels/funcs/deformable_conv_functor.cu @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/funcs/deformable_conv_functor.h" -#include "paddle/phi/core/device_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/common/float16.h" +#include "paddle/phi/core/device_context.h" namespace phi { namespace funcs { diff --git a/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h b/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h index ab6d226b03c56..d68619728cd1f 100644 --- a/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h @@ -17,11 +17,11 @@ #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/kernels/cast_kernel.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/full_kernel.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/deformable_conv_functor.h" -#include "paddle/phi/kernels/cast_kernel.h" namespace phi { @@ -302,43 +302,43 @@ void DeformableConvGradKernel(const Context& dev_ctx, mask_grad_data_ptr); } if (dx) { - if (x.dtype() == DataType::FLOAT16){ + if (x.dtype() == DataType::FLOAT16) { DenseTensor mt_dx = phi::EmptyLike(dev_ctx, *dx); MT* mt_dx_ptr = dev_ctx.template Alloc(&mt_dx); - ModulatedDeformableCol2im(dev_ctx, - col_buffer_ptr, - offset_ptr + i * im2col_step * input_offset_dim, - mask_data_ptr, - input_shape_vec, - col_buffer_shape_vec, - filter_shape_vec, - paddings, - strides, - dilations, - deformable_groups, - mt_dx_ptr + i * im2col_step * input_dim); + ModulatedDeformableCol2im( + dev_ctx, + col_buffer_ptr, + offset_ptr + i * im2col_step * input_offset_dim, + mask_data_ptr, + input_shape_vec, + col_buffer_shape_vec, + filter_shape_vec, + paddings, + strides, + dilations, + deformable_groups, + mt_dx_ptr + i * im2col_step * input_dim); DenseTensor t_dx = phi::Cast(dev_ctx, mt_dx, x.dtype()); dx->ShareDataWith(t_dx); } else { MT* mt_dx_ptr = dev_ctx.template Alloc(dx); - ModulatedDeformableCol2im(dev_ctx, - col_buffer_ptr, - offset_ptr + i * im2col_step * input_offset_dim, - mask_data_ptr, - input_shape_vec, - col_buffer_shape_vec, - filter_shape_vec, - paddings, - strides, - dilations, - deformable_groups, - mt_dx_ptr + i * im2col_step * input_dim); - + ModulatedDeformableCol2im( + dev_ctx, + col_buffer_ptr, + offset_ptr + i * im2col_step * input_offset_dim, + mask_data_ptr, + input_shape_vec, + col_buffer_shape_vec, + filter_shape_vec, + paddings, + strides, + dilations, + deformable_groups, + mt_dx_ptr + i * im2col_step * input_dim); } dx->Resize(x.dims()); - } funcs::ModulatedDeformableIm2col( diff --git a/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py b/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py index c800881ee98a9..b41ef28c79b14 100644 --- a/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py +++ b/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py @@ -308,6 +308,7 @@ def test_check_grad_no_filter(self): class TestModulatedDeformableConvV1InvalidInput(unittest.TestCase): def test_error(self): + def test_invalid_input(): input = [1, 3, 32, 32] offset = fluid.data(name='offset', From e2c7d3846b4972234e499091a46fe5efd9e079ac Mon Sep 17 00:00:00 2001 From: ray96 Date: Sun, 9 Oct 2022 13:43:00 +0800 Subject: [PATCH 19/23] add fp16 grad check --- .../unittests/test_deformable_conv_v1_op.py | 69 +++++++++++++++---- 1 file changed, 54 insertions(+), 15 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py b/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py index b41ef28c79b14..b60a818c447ad 100644 --- a/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py +++ b/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py @@ -285,25 +285,64 @@ class TestWithDouble(TestModulatedDeformableConvOp): def init_type(self): self.dtype = np.float64 +class TestFP16(unittest.TestCase): + + def check_main(self, input_np, offset_np, filter_np, dtype): + paddle.disable_static() + input_np = input_np.astype(dtype) + offset_np = offset_np.astype(dtype) + filter_np = filter_np.astype(dtype) + + input = paddle.to_tensor(input_np) + offset = paddle.to_tensor(offset_np) + filter = paddle.to_tensor(filter_np) + + input.stop_gradient = False + offset.stop_gradient = False + filter.stop_gradient = False + + y = paddle.vision.ops.deform_conv2d(input, offset, filter) + input_grad, offset_grad, filter_grad = paddle.grad(y, [input, offset, filter]) + y_np = y.numpy().astype('float32') + input_grad_np = input_grad.numpy().astype('float32') + offset_grad_np = offset_grad.numpy().astype('float32') + filter_grad_np = filter_grad.numpy().astype('float32') + paddle.enable_static() + return y_np, input_grad_np, offset_grad_np, filter_grad_np + + def test_main(self): + if not paddle.is_compiled_with_cuda(): + return + self.pad = [1, 1] + self.stride = [1, 1] + self.dilations = [1, 1] + self.groups = 1 + self.input_size = [2, 3, 5, 5] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [40, f_c, 1, 1] + self.im2col_step = 1 + self.deformable_groups = 1 + offset_c = 2 * self.deformable_groups * self.filter_size[ + 2] * self.filter_size[3] + self.offset_size = [ + self.input_size[0], offset_c, self.input_size[2], self.input_size[3] + ] -@unittest.skipIf(not core.is_compiled_with_cuda(), - "core is not compiled with CUDA") -@skip_check_grad_ci( - reason="Grad check for deform-fp16 need to be compared with fp32, will be included in another PR") -class TestWithFloat16(TestModulatedDeformableConvOp): - - def init_type(self): - self.dtype = np.float16 - - def test_check_output(self): - self.check_output(check_eager=True, atol=1e-3) + input = np.random.random(self.input_size) + offset = 10 * np.random.random(self.offset_size) + filter = np.random.random(self.filter_size) - def test_check_grad(self): - pass + y_np_1, input_g_np_1, offset_g_np_1, filter_g_np_1 = self.check_main(input,offset,filter, 'float16') + y_np_2, input_g_np_2, offset_g_np_2, filter_g_np_2 = self.check_main(input,offset,filter, 'float32') - def test_check_grad_no_filter(self): - pass + def assert_equal(x, y): + np.testing.assert_allclose(x, y, atol=3e-2) + assert_equal(y_np_1, y_np_2) + assert_equal(input_g_np_1, input_g_np_2) + assert_equal(offset_g_np_1, offset_g_np_2) + assert_equal(filter_g_np_1, filter_g_np_2) class TestModulatedDeformableConvV1InvalidInput(unittest.TestCase): From 15183b1496739d73aa90bcaaca3a7d9786b5c931 Mon Sep 17 00:00:00 2001 From: ray96 Date: Sun, 9 Oct 2022 13:46:39 +0800 Subject: [PATCH 20/23] code style --- .../fluid/tests/unittests/test_deformable_conv_v1_op.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py b/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py index b60a818c447ad..7e55b9e0673aa 100644 --- a/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py +++ b/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py @@ -285,6 +285,7 @@ class TestWithDouble(TestModulatedDeformableConvOp): def init_type(self): self.dtype = np.float64 + class TestFP16(unittest.TestCase): def check_main(self, input_np, offset_np, filter_np, dtype): @@ -333,8 +334,8 @@ def test_main(self): offset = 10 * np.random.random(self.offset_size) filter = np.random.random(self.filter_size) - y_np_1, input_g_np_1, offset_g_np_1, filter_g_np_1 = self.check_main(input,offset,filter, 'float16') - y_np_2, input_g_np_2, offset_g_np_2, filter_g_np_2 = self.check_main(input,offset,filter, 'float32') + y_np_1, input_g_np_1, offset_g_np_1, filter_g_np_1 = self.check_main(input, offset, filter, 'float16') + y_np_2, input_g_np_2, offset_g_np_2, filter_g_np_2 = self.check_main(input, offset, filter, 'float32') def assert_equal(x, y): np.testing.assert_allclose(x, y, atol=3e-2) @@ -344,6 +345,7 @@ def assert_equal(x, y): assert_equal(offset_g_np_1, offset_g_np_2) assert_equal(filter_g_np_1, filter_g_np_2) + class TestModulatedDeformableConvV1InvalidInput(unittest.TestCase): def test_error(self): From 88192ccc2b5dc13ba60cbda156e40df503aad69d Mon Sep 17 00:00:00 2001 From: ray96 Date: Sun, 9 Oct 2022 16:28:38 +0800 Subject: [PATCH 21/23] code style --- .../fluid/tests/unittests/test_deformable_conv_v1_op.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py b/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py index 7e55b9e0673aa..982bc5a379981 100644 --- a/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py +++ b/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py @@ -303,7 +303,8 @@ def check_main(self, input_np, offset_np, filter_np, dtype): filter.stop_gradient = False y = paddle.vision.ops.deform_conv2d(input, offset, filter) - input_grad, offset_grad, filter_grad = paddle.grad(y, [input, offset, filter]) + input_grad, offset_grad, filter_grad = paddle.grad( + y, [input, offset, filter]) y_np = y.numpy().astype('float32') input_grad_np = input_grad.numpy().astype('float32') offset_grad_np = offset_grad.numpy().astype('float32') @@ -334,8 +335,10 @@ def test_main(self): offset = 10 * np.random.random(self.offset_size) filter = np.random.random(self.filter_size) - y_np_1, input_g_np_1, offset_g_np_1, filter_g_np_1 = self.check_main(input, offset, filter, 'float16') - y_np_2, input_g_np_2, offset_g_np_2, filter_g_np_2 = self.check_main(input, offset, filter, 'float32') + y_np_1, input_g_np_1, offset_g_np_1, filter_g_np_1 = self.check_main( + input, offset, filter, 'float16') + y_np_2, input_g_np_2, offset_g_np_2, filter_g_np_2 = self.check_main( + input, offset, filter, 'float32') def assert_equal(x, y): np.testing.assert_allclose(x, y, atol=3e-2) From c772392ee39bd5344075c3d85bfcb9d98b80f978 Mon Sep 17 00:00:00 2001 From: ray96 Date: Sun, 9 Oct 2022 17:12:53 +0800 Subject: [PATCH 22/23] modify --- paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc | 2 +- paddle/phi/kernels/funcs/deformable_conv_functor.cc | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc b/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc index d38310360d0ae..050c61596fee5 100644 --- a/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc @@ -248,7 +248,7 @@ void ModulatedDeformableCol2imCoordCPUKernel( } cnt += 1; } - grad_offset[i] = static_cast(val); + grad_offset[i] = val; if (grad_mask && offset_c % 2 == 0) grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + diff --git a/paddle/phi/kernels/funcs/deformable_conv_functor.cc b/paddle/phi/kernels/funcs/deformable_conv_functor.cc index 226fa51bc335b..253a66adfc6a2 100644 --- a/paddle/phi/kernels/funcs/deformable_conv_functor.cc +++ b/paddle/phi/kernels/funcs/deformable_conv_functor.cc @@ -78,20 +78,20 @@ inline void ModulatedDeformableIm2colCPUKernel( const T offset_h = data_offset_ptr[data_offset_h_ptr]; const T offset_w = data_offset_ptr[data_offset_w_ptr]; - T val = 0; + T val = static_cast(0); const T h_im = h_in + i * dilation_h + offset_h; const T w_im = w_in + j * dilation_w + offset_w; if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) { val = DmcnIm2colBilinear( data_im_ptr, width, height, width, h_im, w_im); } + *data_col_ptr = val; if (data_mask_ptr) { const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_col) * width_col + w_col; const T mask = data_mask_ptr[data_mask_hw_ptr]; - val *= mask; + *data_col_ptr *= mask; } - *data_col_ptr = static_cast(val); data_col_ptr += batch_size * height_col * width_col; } } From 82ce081a7f9077d8c6ca340a92bcedc5a2a76388 Mon Sep 17 00:00:00 2001 From: ray96 Date: Sun, 9 Oct 2022 22:32:14 +0800 Subject: [PATCH 23/23] modify --- .../impl/deformable_conv_grad_kernel_impl.h | 50 ++++++------------- 1 file changed, 14 insertions(+), 36 deletions(-) diff --git a/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h b/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h index d68619728cd1f..7402a2273365b 100644 --- a/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h @@ -302,42 +302,20 @@ void DeformableConvGradKernel(const Context& dev_ctx, mask_grad_data_ptr); } if (dx) { - if (x.dtype() == DataType::FLOAT16) { - DenseTensor mt_dx = phi::EmptyLike(dev_ctx, *dx); - MT* mt_dx_ptr = dev_ctx.template Alloc(&mt_dx); - - ModulatedDeformableCol2im( - dev_ctx, - col_buffer_ptr, - offset_ptr + i * im2col_step * input_offset_dim, - mask_data_ptr, - input_shape_vec, - col_buffer_shape_vec, - filter_shape_vec, - paddings, - strides, - dilations, - deformable_groups, - mt_dx_ptr + i * im2col_step * input_dim); - - DenseTensor t_dx = phi::Cast(dev_ctx, mt_dx, x.dtype()); - dx->ShareDataWith(t_dx); - } else { - MT* mt_dx_ptr = dev_ctx.template Alloc(dx); - ModulatedDeformableCol2im( - dev_ctx, - col_buffer_ptr, - offset_ptr + i * im2col_step * input_offset_dim, - mask_data_ptr, - input_shape_vec, - col_buffer_shape_vec, - filter_shape_vec, - paddings, - strides, - dilations, - deformable_groups, - mt_dx_ptr + i * im2col_step * input_dim); - } + MT* mt_dx_ptr = dev_ctx.template Alloc(dx); + + ModulatedDeformableCol2im(dev_ctx, + col_buffer_ptr, + offset_ptr + i * im2col_step * input_offset_dim, + mask_data_ptr, + input_shape_vec, + col_buffer_shape_vec, + filter_shape_vec, + paddings, + strides, + dilations, + deformable_groups, + mt_dx_ptr + i * im2col_step * input_dim); dx->Resize(x.dims()); }