-
Notifications
You must be signed in to change notification settings - Fork 5.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
【Hackathon No.56&38】deformable_conv_v1 算子实现 float16 数据类型支持&前向运行加速 #46111
Changes from 14 commits
6b51055
3fa8f48
f6c8424
62990e7
2820e9b
9a28eff
c56849f
7683477
ec5644f
4538bb5
e0c75af
ab8aeec
c331772
b697128
242d9a6
daf5b2a
f1b463a
916abf1
f1fec53
e2c7d38
15183b1
88192cc
c772392
82ce081
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -43,6 +43,7 @@ inline void ModulatedDeformableCol2imCPUKernel( | |
const int height_col, | ||
const int width_col, | ||
T* grad_im) { | ||
using MT = typename phi::dtype::MPTypeTrait<T>::Type; | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
猜测有没有一种可能,单测出错的原因是因为你这里使用了MT?
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
好像有可能🤔 我在cc文件里不用MT试试
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
「人工校验」的方式,是说和python实现的reference版本结果进行对比?确保如下2个方面: 另外,conv类算子可能本身误差较大,我看fp32的单测,max_relative_error已经设置到0.05、0.1这么大了。
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
不是的,是直接使用c++版本的float32和float16二者的grad打印出来进行比较,可以参考这里的截图:https://github.com/PaddlePaddle/Paddle/pull/46111#issuecomment-1253283724 。 |
||
for (int thread = 0; thread < num_kernels; thread++) { | ||
const int j = (thread / width_col / height_col / batch_size) % kernel_w; | ||
const int i = | ||
|
@@ -67,17 +68,17 @@ inline void ModulatedDeformableCol2imCPUKernel( | |
((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; | ||
const int data_mask_hw_ptr = | ||
((i * kernel_w + j) * height_col + h_out) * width_col + w_out; | ||
const T offset_h = data_offset_ptr[data_offset_h_ptr]; | ||
const T offset_w = data_offset_ptr[data_offset_w_ptr]; | ||
const T cur_inv_h_data = h_in + i * dilation_h + offset_h; | ||
const T cur_inv_w_data = w_in + j * dilation_w + offset_w; | ||
const MT offset_h = static_cast<MT>(data_offset_ptr[data_offset_h_ptr]); | ||
const MT offset_w = static_cast<MT>(data_offset_ptr[data_offset_w_ptr]); | ||
const MT cur_inv_h_data = h_in + i * dilation_h + offset_h; | ||
const MT cur_inv_w_data = w_in + j * dilation_w + offset_w; | ||
|
||
T cur_top_grad = data_col[thread]; | ||
MT cur_top_grad = static_cast<MT>(data_col[thread]); | ||
if (data_mask) { | ||
const T* data_mask_ptr = | ||
data_mask + (b * deformable_group + deformable_group_index) * | ||
kernel_h * kernel_w * height_col * width_col; | ||
const T mask = data_mask_ptr[data_mask_hw_ptr]; | ||
const MT mask = static_cast<MT>(data_mask_ptr[data_mask_hw_ptr]); | ||
cur_top_grad *= mask; | ||
} | ||
const int cur_h = static_cast<int>(cur_inv_h_data); | ||
|
@@ -89,22 +90,23 @@ inline void ModulatedDeformableCol2imCPUKernel( | |
abs(cur_inv_w_data - (cur_w + dx)) < 1) { | ||
int cur_bottom_grad_pos = | ||
((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; | ||
T weight = DmcnGetGradientWeight(cur_inv_h_data, | ||
cur_inv_w_data, | ||
cur_h + dy, | ||
cur_w + dx, | ||
height, | ||
width); | ||
MT weight = DmcnGetGradientWeight(cur_inv_h_data, | ||
cur_inv_w_data, | ||
cur_h + dy, | ||
cur_w + dx, | ||
height, | ||
width); | ||
|
||
*(grad_im + cur_bottom_grad_pos) = | ||
*(grad_im + cur_bottom_grad_pos) + weight * cur_top_grad; | ||
*(grad_im + cur_bottom_grad_pos) + | ||
static_cast<T>(weight * cur_top_grad); | ||
} | ||
} | ||
} | ||
} | ||
} | ||
|
||
template <typename T, typename Context> | ||
template <typename T, typename MT, typename Context> | ||
void ModulatedDeformableCol2im(const Context& dev_ctx, | ||
const T* data_col, | ||
const T* data_offset, | ||
|
@@ -116,7 +118,7 @@ void ModulatedDeformableCol2im(const Context& dev_ctx, | |
const std::vector<int>& stride, | ||
const std::vector<int>& dilation, | ||
const int deformable_group, | ||
T* grad_im) { | ||
MT* grad_im) { | ||
int channel_per_deformable_group = im_shape[0] / deformable_group; | ||
int num_kernels = col_shape[0] * col_shape[1] * col_shape[2] * col_shape[3]; | ||
|
||
|
@@ -169,8 +171,9 @@ void ModulatedDeformableCol2imCoordCPUKernel( | |
const int width_col, | ||
T* grad_offset, | ||
T* grad_mask) { | ||
using MT = typename phi::dtype::MPTypeTrait<T>::Type; | ||
for (int i = 0; i < num_kernels; i++) { | ||
T val = 0, mval = 0; | ||
MT val = 0, mval = 0; | ||
const int w = i % width_col; | ||
const int h = (i / width_col) % height_col; | ||
const int c = (i / width_col / height_col) % offset_channels; | ||
|
@@ -215,40 +218,41 @@ void ModulatedDeformableCol2imCoordCPUKernel( | |
const int data_offset_w_ptr = | ||
(((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + | ||
w_out); | ||
const T offset_h = data_offset_ptr[data_offset_h_ptr]; | ||
const T offset_w = data_offset_ptr[data_offset_w_ptr]; | ||
T inv_h = h_in + i * dilation_h + offset_h; | ||
T inv_w = w_in + j * dilation_w + offset_w; | ||
const MT offset_h = static_cast<MT>(data_offset_ptr[data_offset_h_ptr]); | ||
const MT offset_w = static_cast<MT>(data_offset_ptr[data_offset_w_ptr]); | ||
MT inv_h = h_in + i * dilation_h + offset_h; | ||
MT inv_w = w_in + j * dilation_w + offset_w; | ||
if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) { | ||
inv_h = inv_w = -2; | ||
} else { | ||
mval += data_col_ptr[col_pos] * | ||
funcs::DmcnIm2colBilinear(data_im_ptr + cnt * height * width, | ||
width, | ||
height, | ||
width, | ||
inv_h, | ||
inv_w); | ||
mval += | ||
static_cast<MT>(data_col_ptr[col_pos]) * | ||
funcs::DmcnIm2colBilinear<T, MT>(data_im_ptr + cnt * height * width, | ||
width, | ||
height, | ||
width, | ||
inv_h, | ||
inv_w); | ||
} | ||
const T weight = | ||
DmcnGetCoordinateWeight(inv_h, | ||
inv_w, | ||
height, | ||
width, | ||
data_im_ptr + cnt * height * width, | ||
width, | ||
bp_dir); | ||
const MT weight = | ||
DmcnGetCoordinateWeight<T, MT>(inv_h, | ||
inv_w, | ||
height, | ||
width, | ||
data_im_ptr + cnt * height * width, | ||
width, | ||
bp_dir); | ||
if (data_mask_ptr) { | ||
const int data_mask_hw_ptr = | ||
(((i * kernel_w + j) * height_col + h_out) * width_col + w_out); | ||
const T mask = data_mask_ptr[data_mask_hw_ptr]; | ||
val += weight * data_col_ptr[col_pos] * mask; | ||
const MT mask = static_cast<MT>(data_mask_ptr[data_mask_hw_ptr]); | ||
val += weight * static_cast<MT>(data_col_ptr[col_pos]) * mask; | ||
} else { | ||
val += weight * data_col_ptr[col_pos]; | ||
val += weight * static_cast<MT>(data_col_ptr[col_pos]); | ||
} | ||
cnt += 1; | ||
} | ||
grad_offset[i] = val; | ||
grad_offset[i] = static_cast<T>(val); | ||
if (grad_mask && offset_c % 2 == 0) | ||
grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * | ||
kernel_w + | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
不要求支持CPU的FP16 Kernel,因此这个文件暂时不要修改。
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
有的函数由于cc和cu文件共用,就在cpu的文件中增加了少量代码以适配。