From 467d580dde745ebe572eb38db795748b0407111c Mon Sep 17 00:00:00 2001
From: YibinLiu666 <2632839426@qq.com>
Date: Thu, 7 Mar 2024 12:24:38 +0000
Subject: [PATCH 01/16] improve the performance of divide double grad

---
 .../impl/elementwise_grad_kernel_impl.h       | 186 +++++++++++-------
 1 file changed, 116 insertions(+), 70 deletions(-)

diff --git a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h
index db6858bc9d7d7..0641c2601a1d3 100644
--- a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h
@@ -173,26 +173,8 @@ void DivideDoubleGradKernel(const Context& dev_ctx,
                             DenseTensor* dy,
                             DenseTensor* dout,
                             DenseTensor* ddout) {
-  if (dy) {
-    dy->Resize(y.dims());
-    dev_ctx.template Alloc<T>(dy);
-  }
-  if (dout) {
-    dout->Resize(out.dims());
-    dev_ctx.template Alloc<T>(dout);
-  }
-  if (ddout) {
-    ddout->Resize(out.dims());
-    dev_ctx.template Alloc<T>(ddout);
-  }
-  // ddX_safe == null ? 0 : ddX
-  // ddY_safe == null ? 0 : ddY
-  DenseTensor ddX_safe, ddY_safe;
-  phi::funcs::GetDoubleGradSafeTensor<T, Context>(
-      dev_ctx, dx, ddx.get_ptr(), &ddX_safe);
-  phi::funcs::GetDoubleGradSafeTensor<T, Context>(
-      dev_ctx, y, ddy.get_ptr(), &ddY_safe);
-
+  auto* ddx_tensor = ddx.get_ptr();
+  auto* ddy_tensor = ddy.get_ptr();
   // ddOut = ddX / Y - Out * ddY / Y = (ddX - Out * ddY) / Y
   // dY = Out * dX * ddY / Y - dX * ddX / Y
   // dOut = - dX * ddY
@@ -206,63 +188,127 @@ void DivideDoubleGradKernel(const Context& dev_ctx,
     dev_ctx.template Alloc<T>(&tmp);
   }
   if (dy) {
-    // dX_div_Y = dX / Y;
-    DenseTensor dX_div_Y = tmp;
-    funcs::DefaultElementwiseOperator<Context,
-                                      T,
-                                      funcs::DivideFunctor<T>,
-                                      funcs::InverseDivideFunctor<T>>(
-        dev_ctx, dx, y, &dX_div_Y, axis);
-
-    // NOTE(dengkaipeng): in the following ElemwiseGradCompute, for the
-    // first output tensor is nullptr, the branch to calculate first
-    // output tensor will not be activated, DivGradDx function will not
-    // be called and can be ignored, the first branch has little effect
-    // on running speed.
-
-    // dY = Out * dX * ddY / Y - dX * ddX / Y
-    phi::funcs::ElemwiseGradCompute<Context, T, DivGradDX<T>, DivDoubleDY<T>>(
-        dev_ctx,
-        ddX_safe,
-        ddY_safe,
-        out,
-        dX_div_Y,
-        axis,
-        nullptr,
-        dy,
-        DivGradDX<T>(),
-        DivDoubleDY<T>());
+    dy->Resize(y.dims());
+    dev_ctx.template Alloc<T>(dy);
+    if (ddx_tensor == nullptr && ddy_tensor == nullptr) {
+      dy = nullptr;
+    } else {
+      DenseTensor tmp_dy = tmp;
+      // dX / Y
+      funcs::DefaultElementwiseOperator<Context,
+                                        T,
+                                        funcs::DivideFunctor<T>,
+                                        funcs::InverseDivideFunctor<T>>(
+          dev_ctx, dx, y, &tmp_dy, axis);
+      if (ddx_tensor != nullptr && ddy_tensor == nullptr) {
+        // dy = -dX * ddX / Y
+        funcs::DefaultElementwiseOperator<Context,
+                                          T,
+                                          funcs::MultiplyFunctor<T>,
+                                          funcs::InverseMultiplyFunctor<T>>(
+            dev_ctx, *ddx_tensor, tmp_dy, dy, axis);
+        auto& place = *dev_ctx.eigen_device();
+        auto dy_result = phi::EigenVector<T>::Flatten(*dy);
+        dy_result.device(place) = static_cast<T>(-1) * dy_result;
+      } else if (ddx_tensor == nullptr && ddy_tensor != nullptr) {
+        // dY = Out * dX * ddY / Y
+        funcs::DefaultElementwiseOperator<Context,
+                                          T,
+                                          funcs::MultiplyFunctor<T>,
+                                          funcs::InverseMultiplyFunctor<T>>(
+            dev_ctx, *ddy_tensor, tmp_dy, &tmp_dy, axis);
+        funcs::DefaultElementwiseOperator<Context,
+                                          T,
+                                          funcs::MultiplyFunctor<T>,
+                                          funcs::InverseMultiplyFunctor<T>>(
+            dev_ctx, out, tmp_dy, dy, axis);
+      } else {
+        // dY = Out * dX * ddY / Y - dX * ddX / Y
+
+        // NOTE(dengkaipeng): in the following ElemwiseGradCompute, when the
+        // first output tensor is nullptr, the branch that calculates the
+        // first output tensor is not taken, so DivGradDX will not be called
+        // and can be ignored; the first branch has little effect on
+        // running speed.
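+        // Here tmp_dy already holds dX / Y, so DivDoubleDY receives
+        // (x, y, out, dout) = (ddX, ddY, Out, dX / Y) and produces
+        // dY = ddY * Out * (dX / Y) - ddX * (dX / Y) in one pass.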
+        phi::funcs::
+            ElemwiseGradCompute<Context, T, DivGradDX<T>, DivDoubleDY<T>>(
+                dev_ctx,
+                *ddx_tensor,
+                *ddy_tensor,
+                out,
+                tmp_dy,
+                axis,
+                nullptr,
+                dy,
+                DivGradDX<T>(),
+                DivDoubleDY<T>());
+      }
+    }
   }
   if (ddout) {
+    ddout->Resize(out.dims());
+    dev_ctx.template Alloc<T>(ddout);
     // ddOut = ddX / Y - Out * ddY / Y = (ddX - Out * ddY) / Y
-    funcs::DefaultElementwiseOperator<Context,
-                                      T,
-                                      funcs::MultiplyFunctor<T>,
-                                      funcs::InverseMultiplyFunctor<T>>(
-        dev_ctx, out, ddY_safe, &tmp, axis);
-    funcs::DefaultElementwiseOperator<Context,
-                                      T,
-                                      funcs::SubtractFunctor<T>,
-                                      funcs::InverseSubtractFunctor<T>>(
-        dev_ctx, ddX_safe, tmp, &tmp, axis);
-    funcs::DefaultElementwiseOperator<Context,
-                                      T,
-                                      funcs::DivideFunctor<T>,
-                                      funcs::InverseDivideFunctor<T>>(
-        dev_ctx, tmp, y, ddout, axis);
+    if (ddx_tensor == nullptr && ddy_tensor == nullptr) {
+      ddout = nullptr;
+    } else if (ddx_tensor != nullptr && ddy_tensor == nullptr) {
+      // ddOut = ddX / Y
+      funcs::DefaultElementwiseOperator<Context,
+                                        T,
+                                        funcs::DivideFunctor<T>,
+                                        funcs::InverseDivideFunctor<T>>(
+          dev_ctx, *ddx_tensor, y, ddout, axis);
+    } else if (ddx_tensor == nullptr && ddy_tensor != nullptr) {
+      // ddOut = - Out * ddY / Y
+      funcs::DefaultElementwiseOperator<Context,
+                                        T,
+                                        funcs::MultiplyFunctor<T>,
+                                        funcs::InverseMultiplyFunctor<T>>(
+          dev_ctx, out, *ddy_tensor, &tmp, axis);
+      funcs::DefaultElementwiseOperator<Context,
+                                        T,
+                                        funcs::DivideFunctor<T>,
+                                        funcs::InverseDivideFunctor<T>>(
+          dev_ctx, tmp, y, ddout, axis);
+      auto& place = *dev_ctx.eigen_device();
+      auto ddout_result = phi::EigenVector<T>::Flatten(*ddout);
+      ddout_result.device(place) = static_cast<T>(-1) * ddout_result;
+    } else {
+      funcs::DefaultElementwiseOperator<Context,
+                                        T,
+                                        funcs::MultiplyFunctor<T>,
+                                        funcs::InverseMultiplyFunctor<T>>(
+          dev_ctx, out, *ddy_tensor, &tmp, axis);
+      funcs::DefaultElementwiseOperator<Context,
+                                        T,
+                                        funcs::SubtractFunctor<T>,
+                                        funcs::InverseSubtractFunctor<T>>(
+          dev_ctx, *ddx_tensor, tmp, &tmp, axis);
+      funcs::DefaultElementwiseOperator<Context,
+                                        T,
+                                        funcs::DivideFunctor<T>,
+                                        funcs::InverseDivideFunctor<T>>(
+          dev_ctx, tmp, y, ddout, axis);
+    }
   }
   if (dout) {
-    // dOut = - dX * ddY
-    funcs::DefaultElementwiseOperator<Context,
-                                      T,
-                                      funcs::MultiplyFunctor<T>,
-                                      funcs::InverseMultiplyFunctor<T>>(
-        dev_ctx, dx, ddY_safe, dout, axis);
-    auto& place = *dev_ctx.eigen_device();
-    auto dout_result = phi::EigenVector<T>::Flatten(*dout);
-    dout_result.device(place) = static_cast<T>(-1) * dout_result;
+    dout->Resize(out.dims());
+    dev_ctx.template Alloc<T>(dout);
+    if (ddy_tensor == nullptr) {
+      dout = nullptr;
+    } else {
+      // dOut = - dX * ddY
+      funcs::DefaultElementwiseOperator<Context,
+                                        T,
+                                        funcs::MultiplyFunctor<T>,
+                                        funcs::InverseMultiplyFunctor<T>>(
+          dev_ctx, dx, *ddy_tensor, dout, axis);
+      auto& place = *dev_ctx.eigen_device();
+      auto dout_result = phi::EigenVector<T>::Flatten(*dout);
+      dout_result.device(place) = static_cast<T>(-1) * dout_result;
+    }
   }
 }
 
 template <typename T, typename Context>

From 5563d5ee2e6a89c573bb6d8e6953bf27ef0aea7f Mon Sep 17 00:00:00 2001
From: YibinLiu666 <2632839426@qq.com>
Date: Thu, 7 Mar 2024 16:05:52 +0000
Subject: [PATCH 02/16] fix infermeta

---
 paddle/phi/api/yaml/legacy_backward.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml
index e5529aa6c5efa..1ee9a29d7e644 100755
--- a/paddle/phi/api/yaml/legacy_backward.yaml
+++ b/paddle/phi/api/yaml/legacy_backward.yaml
@@ -179,7 +179,7 @@
   output : Tensor(y_grad), Tensor(out_grad), Tensor(grad_out_grad)
   infer_meta :
     func : GeneralTernaryGradInferMeta
-    param : [y, grad_x, grad_x]
+    param : [y, out, out]
   kernel :
     func : divide_double_grad
     data_type : out

From 2683315e88e32e3ad98ec3a59e3d63544344166f Mon Sep 17 00:00:00 2001
From: YibinLiu666 <2632839426@qq.com>
Date: Sat, 9 Mar 2024 04:38:17 +0000
Subject: [PATCH 03/16] update

---
 .../ops_signature/elementwise_sig.cc          | 2 +-
 .../pir/dialect/operator/ir/ops_backward.yaml | 6 ++--
 paddle/phi/api/yaml/legacy_backward.yaml      | 4 +--
.../kernels/elementwise_divide_grad_kernel.h | 3 +- .../impl/elementwise_grad_kernel_impl.h | 30 +++++++++++++++++-- 5 files changed, 35 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/operators/ops_signature/elementwise_sig.cc b/paddle/fluid/operators/ops_signature/elementwise_sig.cc index b1150268fbad1..82f891bb48a00 100644 --- a/paddle/fluid/operators/ops_signature/elementwise_sig.cc +++ b/paddle/fluid/operators/ops_signature/elementwise_sig.cc @@ -168,7 +168,7 @@ KernelSignature ElementwiseDivGradOpArgumentMapping( KernelSignature ElementwiseDivDoubleGradOpArgumentMapping( const ArgumentMappingContext& ctx UNUSED) { return KernelSignature("divide_double_grad", - {"Y", "Out", "DX", "DDX", "DDY"}, + {"Y", "Out", "Out@GRAD", "DX", "DDX", "DDY"}, {"axis"}, {"Y@GRAD", "DOut", "DDOut"}); } diff --git a/paddle/fluid/pir/dialect/operator/ir/ops_backward.yaml b/paddle/fluid/pir/dialect/operator/ir/ops_backward.yaml index 7b3068a8ab6c9..9f115d7744650 100644 --- a/paddle/fluid/pir/dialect/operator/ir/ops_backward.yaml +++ b/paddle/fluid/pir/dialect/operator/ir/ops_backward.yaml @@ -190,15 +190,15 @@ - backward_op : divide_double_grad forward : divide_grad (Tensor x, Tensor y, Tensor out, Tensor grad_out, int axis = -1) -> Tensor(grad_x), Tensor(grad_y) - args : (Tensor y, Tensor out, Tensor grad_x, Tensor grad_x_grad, Tensor grad_y_grad, int axis = -1) + args : (Tensor y, Tensor out, Tensor grad_out, Tensor grad_x, Tensor grad_x_grad, Tensor grad_y_grad, int axis = -1) output : Tensor(y_grad), Tensor(out_grad), Tensor(grad_out_grad) infer_meta : func : GeneralTernaryGradInferMeta - param : [y, grad_x, grad_x] + param : [y, out, out] kernel : func : divide_double_grad data_type : out - optional : grad_x_grad, grad_y_grad + optional : grad_x, grad_x_grad, grad_y_grad inplace : (grad_x_grad -> grad_out_grad) - backward_op : divide_grad diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index 1ee9a29d7e644..a22a25eeda48a 100755 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -175,7 +175,7 @@ - backward_op : divide_double_grad forward : divide_grad (Tensor x, Tensor y, Tensor out, Tensor grad_out, int axis = -1) -> Tensor(grad_x), Tensor(grad_y) - args : (Tensor y, Tensor out, Tensor grad_x, Tensor grad_x_grad, Tensor grad_y_grad, int axis = -1) + args : (Tensor y, Tensor out, Tensor grad_out, Tensor grad_x, Tensor grad_x_grad, Tensor grad_y_grad, int axis = -1) output : Tensor(y_grad), Tensor(out_grad), Tensor(grad_out_grad) infer_meta : func : GeneralTernaryGradInferMeta @@ -183,7 +183,7 @@ kernel : func : divide_double_grad data_type : out - optional : grad_x_grad, grad_y_grad + optional : grad_x, grad_x_grad, grad_y_grad inplace : (grad_x_grad -> grad_out_grad) - backward_op : divide_grad diff --git a/paddle/phi/kernels/elementwise_divide_grad_kernel.h b/paddle/phi/kernels/elementwise_divide_grad_kernel.h index c764f05c3983f..15b1e65a9cfdf 100644 --- a/paddle/phi/kernels/elementwise_divide_grad_kernel.h +++ b/paddle/phi/kernels/elementwise_divide_grad_kernel.h @@ -33,7 +33,8 @@ template void DivideDoubleGradKernel(const Context& dev_ctx, const DenseTensor& y, const DenseTensor& out, - const DenseTensor& dx, + const DenseTensor& grad_out, + const paddle::optional& dx, const paddle::optional& ddx, const paddle::optional& ddy, int axis, diff --git a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h index 0641c2601a1d3..d6d9bb1c726cc 
100644 --- a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h @@ -166,7 +166,8 @@ template void DivideDoubleGradKernel(const Context& dev_ctx, const DenseTensor& y, const DenseTensor& out, - const DenseTensor& dx, + const DenseTensor& grad_out, + const paddle::optional& dx, const paddle::optional& ddx, const paddle::optional& ddy, int axis, @@ -175,6 +176,7 @@ void DivideDoubleGradKernel(const Context& dev_ctx, DenseTensor* ddout) { auto* ddx_tensor = ddx.get_ptr(); auto* ddy_tensor = ddy.get_ptr(); + auto* dx_tensor = dx.get_ptr(); // ddOut = ddX / Y - Out * ddY / Y = (ddX - Out * ddY) / Y // dY = Out * dX * ddY / Y - dX * ddX / Y // dOut = - dX * ddY @@ -193,13 +195,24 @@ void DivideDoubleGradKernel(const Context& dev_ctx, if (ddx_tensor == nullptr && ddy_tensor == nullptr) { dy = nullptr; } else { + if (dx_tensor == nullptr || dx_tensor->dims() != out.dims()) { + DenseTensor dz_div_y; + dz_div_y.Resize(out.dims()); + dev_ctx.template Alloc(&dz_div_y); + funcs::DefaultElementwiseOperator, + funcs::InverseDivideFunctor>( + dev_ctx, grad_out, y, &dz_div_y, axis); + dx_tensor = &dz_div_y; + } DenseTensor tmp_dy = tmp; // dX / Y funcs::DefaultElementwiseOperator, funcs::InverseDivideFunctor>( - dev_ctx, dx, y, &tmp_dy, axis); + dev_ctx, *dx_tensor, y, &tmp_dy, axis); if (ddx_tensor != nullptr && ddy_tensor == nullptr) { // dy = -dX * ddX / Y funcs::DefaultElementwiseOperatordims() != out.dims()) { + DenseTensor dz_div_y; + dz_div_y.Resize(out.dims()); + dev_ctx.template Alloc(&dz_div_y); + funcs::DefaultElementwiseOperator, + funcs::InverseDivideFunctor>( + dev_ctx, grad_out, y, &dz_div_y, axis); + dx_tensor = &dz_div_y; + } // dOut = - dX * ddY funcs::DefaultElementwiseOperator, funcs::InverseMultiplyFunctor>( - dev_ctx, dx, *ddy_tensor, dout, axis); + dev_ctx, *dx_tensor, *ddy_tensor, dout, axis); auto& place = *dev_ctx.eigen_device(); auto dout_result = phi::EigenVector::Flatten(*dout); dout_result.device(place) = static_cast(-1) * dout_result; From f28f02b7790d254c1660386ce4dcc9f5ef471e42 Mon Sep 17 00:00:00 2001 From: YibinLiu666 <2632839426@qq.com> Date: Mon, 11 Mar 2024 11:44:13 +0000 Subject: [PATCH 04/16] fix some bug --- .../impl/elementwise_grad_kernel_impl.h | 33 +++++++------------ 1 file changed, 11 insertions(+), 22 deletions(-) diff --git a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h index d6d9bb1c726cc..e1595df42bbfe 100644 --- a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h @@ -177,6 +177,17 @@ void DivideDoubleGradKernel(const Context& dev_ctx, auto* ddx_tensor = ddx.get_ptr(); auto* ddy_tensor = ddy.get_ptr(); auto* dx_tensor = dx.get_ptr(); + DenseTensor dz_div_y; + dz_div_y.Resize(out.dims()); + if (dx_tensor == nullptr || dx_tensor->dims() != out.dims()) { + dev_ctx.template Alloc(&dz_div_y); + funcs::DefaultElementwiseOperator, + funcs::InverseDivideFunctor>( + dev_ctx, grad_out, y, &dz_div_y, axis); + dx_tensor = &dz_div_y; + } // ddOut = ddX / Y - Out * ddY / Y = (ddX - Out * ddY) / Y // dY = Out * dX * ddY / Y - dX * ddX / Y // dOut = - dX * ddY @@ -195,17 +206,6 @@ void DivideDoubleGradKernel(const Context& dev_ctx, if (ddx_tensor == nullptr && ddy_tensor == nullptr) { dy = nullptr; } else { - if (dx_tensor == nullptr || dx_tensor->dims() != out.dims()) { - DenseTensor dz_div_y; - dz_div_y.Resize(out.dims()); - dev_ctx.template Alloc(&dz_div_y); - 
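+        // dX is optional here; when it is absent or was reduced by
+        // broadcasting, rebuild it from the first-order formula
+        // dX = dOut / Y before using it below: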
funcs::DefaultElementwiseOperator, - funcs::InverseDivideFunctor>( - dev_ctx, grad_out, y, &dz_div_y, axis); - dx_tensor = &dz_div_y; - } DenseTensor tmp_dy = tmp; // dX / Y funcs::DefaultElementwiseOperatordims() != out.dims()) { - DenseTensor dz_div_y; - dz_div_y.Resize(out.dims()); - dev_ctx.template Alloc(&dz_div_y); - funcs::DefaultElementwiseOperator, - funcs::InverseDivideFunctor>( - dev_ctx, grad_out, y, &dz_div_y, axis); - dx_tensor = &dz_div_y; - } // dOut = - dX * ddY funcs::DefaultElementwiseOperator Date: Tue, 12 Mar 2024 12:40:31 +0000 Subject: [PATCH 05/16] fix bug and update test --- .../impl/elementwise_grad_kernel_impl.h | 20 +++++++++---------- .../test_elementwise_div_grad_grad.cc | 12 ++++++----- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h index e1595df42bbfe..06ab33267e040 100644 --- a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h @@ -179,7 +179,7 @@ void DivideDoubleGradKernel(const Context& dev_ctx, auto* dx_tensor = dx.get_ptr(); DenseTensor dz_div_y; dz_div_y.Resize(out.dims()); - if (dx_tensor == nullptr || dx_tensor->dims() != out.dims()) { + if (!dx_tensor || dx_tensor->dims() != out.dims()) { dev_ctx.template Alloc(&dz_div_y); funcs::DefaultElementwiseOperatorResize(y.dims()); dev_ctx.template Alloc(dy); - if (ddx_tensor == nullptr && ddy_tensor == nullptr) { - dy = nullptr; + if (!ddx_tensor && !ddy_tensor) { + FullLikeKernel(dev_ctx, y, Scalar(0.0), y.dtype(), dy); } else { DenseTensor tmp_dy = tmp; // dX / Y @@ -213,7 +213,7 @@ void DivideDoubleGradKernel(const Context& dev_ctx, funcs::DivideFunctor, funcs::InverseDivideFunctor>( dev_ctx, *dx_tensor, y, &tmp_dy, axis); - if (ddx_tensor != nullptr && ddy_tensor == nullptr) { + if (ddx_tensor && !ddy_tensor) { // dy = -dX * ddX / Y funcs::DefaultElementwiseOperator::Flatten(*dy); dy_result.device(place) = static_cast(-1) * dy_result; - } else if (ddx_tensor == nullptr && ddy_tensor != nullptr) { + } else if (!ddx_tensor && ddy_tensor) { // dY = Out * dX * ddY / Y funcs::DefaultElementwiseOperatorResize(out.dims()); dev_ctx.template Alloc(ddout); // ddOut = ddX / Y - Out * ddY / Y = (ddX - Out * ddY) / Y - if (ddx_tensor == nullptr && ddy_tensor == nullptr) { - ddout = nullptr; + if (!ddx_tensor && !ddy_tensor) { + FullLikeKernel(dev_ctx, out, Scalar(0.0), out.dtype(), ddout); } else if (ddx_tensor != nullptr && ddy_tensor == nullptr) { // ddOut = ddX / Y funcs::DefaultElementwiseOperator, funcs::InverseDivideFunctor>( dev_ctx, *ddx_tensor, y, ddout, axis); - } else if (ddx_tensor == nullptr && ddy_tensor != nullptr) { + } else if (!ddx_tensor && ddy_tensor) { // ddOut = - Out * ddY / Y funcs::DefaultElementwiseOperatorResize(out.dims()); dev_ctx.template Alloc(dout); - if (ddy_tensor == nullptr) { - dout = nullptr; + if (!ddy_tensor) { + FullLikeKernel(dev_ctx, out, Scalar(0.0), out.dtype(), dout); } else { // dOut = - dX * ddY funcs::DefaultElementwiseOperator("elementwise_div_grad_grad", - place, - dims, - {"Y", "Out", "DDX", "DDY", "DX"}, - {"Y@GRAD", "DDOut"}) {} + : TestElementwiseOpGradGrad( + "elementwise_div_grad_grad", + place, + dims, + {"Y", "Out", "Out@GRAD", "DDX", "DDY", "DX"}, + {"Y@GRAD", "DDOut"}) {} using TestElementwiseOpGradGrad::feed_datas_; using TestElementwiseOpGradGrad::expected_outs_; @@ -78,6 +79,7 @@ class TestElementwiseDivGradGradWithoutDout this->op_type_, {{"Y", {"Y"}}, {"Out", 
{"Out"}}, + {"Out@GRAD", {"Out@GRAD"}}, {"DDX", {"DDX"}}, {"DDY", {"DDY"}}, {"DX", {"DX"}}}, From fd22057481e58cf48b8bab658fe1c5beedd0ca64 Mon Sep 17 00:00:00 2001 From: YibinLiu666 <2632839426@qq.com> Date: Wed, 13 Mar 2024 13:08:02 +0000 Subject: [PATCH 06/16] update --- .../impl/elementwise_grad_kernel_impl.h | 204 +++++++++++++----- 1 file changed, 150 insertions(+), 54 deletions(-) diff --git a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h index 06ab33267e040..75e925ff95b0b 100644 --- a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h @@ -158,7 +158,38 @@ struct DivGradDY> { template struct DivDoubleDY { HOSTDEVICE T operator()(T x, T y, T out, T dout) const { - return y * out * dout - x * dout; + return (y * out - x) * dout; + } +}; + +template +struct DivDoubleDY_Only_DDX { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return -x * dout; } +}; + +template +struct DivDoubleDY_Only_DDY { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { + return y * out * dout; + } +}; + +template +struct DivDoubleDDOut { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { + return (x - out * y) / dout; + } +}; + +template +struct DivDoubleDDOut_Only_DDX { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return x / dout; } +}; + +template +struct DivDoubleDDOut_Only_DDY { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { + return -out * y / dout; } }; @@ -179,7 +210,7 @@ void DivideDoubleGradKernel(const Context& dev_ctx, auto* dx_tensor = dx.get_ptr(); DenseTensor dz_div_y; dz_div_y.Resize(out.dims()); - if (!dx_tensor || dx_tensor->dims() != out.dims()) { + if ((dy || dout) && (!dx_tensor || dx_tensor->dims() != out.dims())) { dev_ctx.template Alloc(&dz_div_y); funcs::DefaultElementwiseOperator(dev_ctx, y, Scalar(0.0), y.dtype(), dy); } else { - DenseTensor tmp_dy = tmp; + // DenseTensor tmp_dy = tmp; // dX / Y funcs::DefaultElementwiseOperator, funcs::InverseDivideFunctor>( - dev_ctx, *dx_tensor, y, &tmp_dy, axis); + dev_ctx, *dx_tensor, y, &tmp, axis); if (ddx_tensor && !ddy_tensor) { // dy = -dX * ddX / Y - funcs::DefaultElementwiseOperator, - funcs::InverseMultiplyFunctor>( - dev_ctx, *ddx_tensor, tmp_dy, dy, axis); - auto& place = *dev_ctx.eigen_device(); - auto dy_result = phi::EigenVector::Flatten(*dy); - dy_result.device(place) = static_cast(-1) * dy_result; + // funcs::DefaultElementwiseOperator, + // funcs::InverseMultiplyFunctor>( + // dev_ctx, *ddx_tensor, tmp, dy, axis); + // auto& place = *dev_ctx.eigen_device(); + // auto dy_result = phi::EigenVector::Flatten(*dy); + // dy_result.device(place) = static_cast(-1) * dy_result; + + phi::funcs::ElemwiseGradCompute, + DivDoubleDY_Only_DDX>( + dev_ctx, + *ddx_tensor, // ddx + y, + out, // out + tmp, // dX /Y + axis, + nullptr, + dy, + DivGradDX(), + DivDoubleDY_Only_DDX()); } else if (!ddx_tensor && ddy_tensor) { // dY = Out * dX * ddY / Y - funcs::DefaultElementwiseOperator, - funcs::InverseMultiplyFunctor>( - dev_ctx, *ddy_tensor, tmp_dy, &tmp_dy, axis); - funcs::DefaultElementwiseOperator, - funcs::InverseMultiplyFunctor>( - dev_ctx, out, tmp_dy, dy, axis); + // VLOG(4) << "1"; + // funcs::DefaultElementwiseOperator, + // funcs::InverseMultiplyFunctor>( + // dev_ctx, *ddy_tensor, tmp, &tmp, axis); + // // VLOG(4) << "2"; + // funcs::DefaultElementwiseOperator, + // funcs::InverseMultiplyFunctor>( + // dev_ctx, out, tmp, dy, axis); + // VLOG(4) << "3"; + 
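+          // note: DivDoubleDY_Only_DDY ignores the value of its first
+          // input; it is passed only so ElemwiseGradCompute sees the
+          // right shapes for broadcasting.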
phi::funcs::ElemwiseGradCompute, + DivDoubleDY_Only_DDY>( + dev_ctx, + out, + *ddy_tensor, // ddy + out, // out + tmp, // dX / Y + axis, + nullptr, + dy, + DivGradDX(), + DivDoubleDY_Only_DDY()); } else { // dY = Out * dX * ddY / Y - dX * ddX / Y @@ -246,10 +309,10 @@ void DivideDoubleGradKernel(const Context& dev_ctx, phi::funcs:: ElemwiseGradCompute, DivDoubleDY>( dev_ctx, - *ddx_tensor, - *ddy_tensor, - out, - tmp_dy, + *ddx_tensor, // ddx + *ddy_tensor, // ddy + out, // out + tmp, // dX / Y axis, nullptr, dy, @@ -273,36 +336,67 @@ void DivideDoubleGradKernel(const Context& dev_ctx, funcs::InverseDivideFunctor>( dev_ctx, *ddx_tensor, y, ddout, axis); } else if (!ddx_tensor && ddy_tensor) { + // VLOG(4) << "4"; // ddOut = - Out * ddY / Y - funcs::DefaultElementwiseOperator, - funcs::InverseMultiplyFunctor>( - dev_ctx, out, *ddy_tensor, &tmp, axis); - funcs::DefaultElementwiseOperator, - funcs::InverseDivideFunctor>( - dev_ctx, tmp, y, ddout, axis); - auto& place = *dev_ctx.eigen_device(); - auto ddout_result = phi::EigenVector::Flatten(*ddout); - ddout_result.device(place) = static_cast(-1) * ddout_result; + // funcs::DefaultElementwiseOperator, + // funcs::InverseMultiplyFunctor>( + // dev_ctx, out, *ddy_tensor, &tmp, axis); + // // VLOG(4) << "5"; + // funcs::DefaultElementwiseOperator, + // funcs::InverseDivideFunctor>( + // dev_ctx, tmp, y, ddout, axis); + // auto& place = *dev_ctx.eigen_device(); + // auto ddout_result = phi::EigenVector::Flatten(*ddout); + // // VLOG(4) << "6"; + // ddout_result.device(place) = static_cast(-1) * ddout_result; + phi::funcs::ElemwiseGradCompute, + DivDoubleDDOut_Only_DDY>( + dev_ctx, + out, + *ddy_tensor, // ddy + out, // out + y, // Y + axis, + nullptr, + dy, + DivGradDX(), + DivDoubleDDOut_Only_DDY()); } else { - funcs::DefaultElementwiseOperator, - funcs::InverseMultiplyFunctor>( - dev_ctx, out, *ddy_tensor, &tmp, axis); - funcs::DefaultElementwiseOperator, - funcs::InverseSubtractFunctor>( - dev_ctx, *ddx_tensor, tmp, &tmp, axis); - funcs::DefaultElementwiseOperator, - funcs::InverseDivideFunctor>( - dev_ctx, tmp, y, ddout, axis); + // funcs::DefaultElementwiseOperator, + // funcs::InverseMultiplyFunctor>( + // dev_ctx, out, *ddy_tensor, &tmp, axis); + // funcs::DefaultElementwiseOperator, + // funcs::InverseSubtractFunctor>( + // dev_ctx, *ddx_tensor, tmp, &tmp, axis); + // funcs::DefaultElementwiseOperator, + // funcs::InverseDivideFunctor>( + // dev_ctx, tmp, y, ddout, axis); + phi::funcs::ElemwiseGradCompute, + DivDoubleDDOut_Only_DDX>( + dev_ctx, + *ddx_tensor, + y, // ddy + out, // out + y, // Y + axis, + nullptr, + dy, + DivGradDX(), + DivDoubleDDOut_Only_DDX()); } } @@ -313,6 +407,7 @@ void DivideDoubleGradKernel(const Context& dev_ctx, FullLikeKernel(dev_ctx, out, Scalar(0.0), out.dtype(), dout); } else { // dOut = - dX * ddY + // VLOG(4) << "7"; funcs::DefaultElementwiseOperator, @@ -321,6 +416,7 @@ void DivideDoubleGradKernel(const Context& dev_ctx, auto& place = *dev_ctx.eigen_device(); auto dout_result = phi::EigenVector::Flatten(*dout); dout_result.device(place) = static_cast(-1) * dout_result; + // VLOG(4) << "8"; } } } From ef8f29d99ab5b968a03dae335538aae5cdf4757e Mon Sep 17 00:00:00 2001 From: YibinLiu666 <2632839426@qq.com> Date: Sat, 16 Mar 2024 08:35:41 +0000 Subject: [PATCH 07/16] fix bug --- .../elementwise/elementwise_div_op.cc | 1 + .../impl/elementwise_grad_kernel_impl.h | 130 ++++++++++-------- 2 files changed, 72 insertions(+), 59 deletions(-) diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cc 
b/paddle/fluid/operators/elementwise/elementwise_div_op.cc index 191890865fb89..bcd785293c71e 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cc @@ -107,6 +107,7 @@ class ElementwiseDivDoubleGradMaker : public framework::SingleGradOpMaker { op->SetType("elementwise_div_grad_grad"); op->SetInput("Y", this->Input("Y")); op->SetInput("Out", this->Input("Out")); + op->SetInput("Out@GRAD", this->Input("Out@GRAD")); op->SetInput("DDX", this->OutputGrad(framework::GradVarName("X"))); op->SetInput("DDY", this->OutputGrad(framework::GradVarName("Y"))); op->SetInput("DX", this->Output(framework::GradVarName("X"))); diff --git a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h index 75e925ff95b0b..bd5c72c02e98b 100644 --- a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h @@ -205,6 +205,18 @@ void DivideDoubleGradKernel(const Context& dev_ctx, DenseTensor* dy, DenseTensor* dout, DenseTensor* ddout) { + if (dy) { + dy->Resize(y.dims()); + dev_ctx.template Alloc(dy); + } + if (dout) { + dout->Resize(out.dims()); + dev_ctx.template Alloc(dout); + } + if (ddout) { + ddout->Resize(out.dims()); + dev_ctx.template Alloc(ddout); + } auto* ddx_tensor = ddx.get_ptr(); auto* ddy_tensor = ddy.get_ptr(); auto* dx_tensor = dx.get_ptr(); @@ -289,7 +301,7 @@ void DivideDoubleGradKernel(const Context& dev_ctx, DivGradDX, DivDoubleDY_Only_DDY>( dev_ctx, - out, + *dx_tensor, *ddy_tensor, // ddy out, // out tmp, // dX / Y @@ -338,65 +350,65 @@ void DivideDoubleGradKernel(const Context& dev_ctx, } else if (!ddx_tensor && ddy_tensor) { // VLOG(4) << "4"; // ddOut = - Out * ddY / Y - // funcs::DefaultElementwiseOperator, - // funcs::InverseMultiplyFunctor>( - // dev_ctx, out, *ddy_tensor, &tmp, axis); - // // VLOG(4) << "5"; - // funcs::DefaultElementwiseOperator, - // funcs::InverseDivideFunctor>( - // dev_ctx, tmp, y, ddout, axis); - // auto& place = *dev_ctx.eigen_device(); - // auto ddout_result = phi::EigenVector::Flatten(*ddout); - // // VLOG(4) << "6"; - // ddout_result.device(place) = static_cast(-1) * ddout_result; - phi::funcs::ElemwiseGradCompute, - DivDoubleDDOut_Only_DDY>( - dev_ctx, - out, - *ddy_tensor, // ddy - out, // out - y, // Y - axis, - nullptr, - dy, - DivGradDX(), - DivDoubleDDOut_Only_DDY()); + funcs::DefaultElementwiseOperator, + funcs::InverseMultiplyFunctor>( + dev_ctx, out, *ddy_tensor, &tmp, axis); + // VLOG(4) << "5"; + funcs::DefaultElementwiseOperator, + funcs::InverseDivideFunctor>( + dev_ctx, tmp, y, ddout, axis); + auto& place = *dev_ctx.eigen_device(); + auto ddout_result = phi::EigenVector::Flatten(*ddout); + // VLOG(4) << "6"; + ddout_result.device(place) = static_cast(-1) * ddout_result; + // phi::funcs::ElemwiseGradCompute, + // DivDoubleDDOut_Only_DDY>( + // dev_ctx, + // out, + // *ddy_tensor, // ddy + // out, // out + // y, // Y + // axis, + // nullptr, + // ddout, + // DivGradDX(), + // DivDoubleDDOut_Only_DDY()); } else { - // funcs::DefaultElementwiseOperator, - // funcs::InverseMultiplyFunctor>( - // dev_ctx, out, *ddy_tensor, &tmp, axis); - // funcs::DefaultElementwiseOperator, - // funcs::InverseSubtractFunctor>( - // dev_ctx, *ddx_tensor, tmp, &tmp, axis); - // funcs::DefaultElementwiseOperator, - // funcs::InverseDivideFunctor>( - // dev_ctx, tmp, y, ddout, axis); - phi::funcs::ElemwiseGradCompute, - DivDoubleDDOut_Only_DDX>( - dev_ctx, - *ddx_tensor, - y, // ddy 
- out, // out - y, // Y - axis, - nullptr, - dy, - DivGradDX(), - DivDoubleDDOut_Only_DDX()); + funcs::DefaultElementwiseOperator, + funcs::InverseMultiplyFunctor>( + dev_ctx, out, *ddy_tensor, &tmp, axis); + funcs::DefaultElementwiseOperator, + funcs::InverseSubtractFunctor>( + dev_ctx, *ddx_tensor, tmp, &tmp, axis); + funcs::DefaultElementwiseOperator, + funcs::InverseDivideFunctor>( + dev_ctx, tmp, y, ddout, axis); + // phi::funcs::ElemwiseGradCompute, + // DivDoubleDDOut>( + // dev_ctx, + // *ddx_tensor, + // *ddy_tensor, // ddy + // out, // out + // y, // Y + // axis, + // nullptr, + // ddout, + // DivGradDX(), + // DivDoubleDDOut()); } } From 96c6cc6e193f09467482643bcae12dd5d8bfc69e Mon Sep 17 00:00:00 2001 From: YibinLiu666 <2632839426@qq.com> Date: Sat, 16 Mar 2024 09:06:36 +0000 Subject: [PATCH 08/16] update --- .../impl/elementwise_grad_kernel_impl.h | 118 ++++-------------- 1 file changed, 27 insertions(+), 91 deletions(-) diff --git a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h index bd5c72c02e98b..15f15ac9a38bc 100644 --- a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h @@ -174,24 +174,24 @@ struct DivDoubleDY_Only_DDY { } }; -template -struct DivDoubleDDOut { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { - return (x - out * y) / dout; - } -}; - -template -struct DivDoubleDDOut_Only_DDX { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return x / dout; } -}; - -template -struct DivDoubleDDOut_Only_DDY { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { - return -out * y / dout; - } -}; +// template +// struct DivDoubleDDOut { +// HOSTDEVICE T operator()(T x, T y, T out, T dout) const { +// return (x - out * y) / dout; +// } +// }; + +// template +// struct DivDoubleDDOut_Only_DDX { +// HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return x / dout; } +// }; + +// template +// struct DivDoubleDDOut_Only_DDY { +// HOSTDEVICE T operator()(T x, T y, T out, T dout) const { +// return -out * y / dout; +// } +// }; template void DivideDoubleGradKernel(const Context& dev_ctx, @@ -205,18 +205,6 @@ void DivideDoubleGradKernel(const Context& dev_ctx, DenseTensor* dy, DenseTensor* dout, DenseTensor* ddout) { - if (dy) { - dy->Resize(y.dims()); - dev_ctx.template Alloc(dy); - } - if (dout) { - dout->Resize(out.dims()); - dev_ctx.template Alloc(dout); - } - if (ddout) { - ddout->Resize(out.dims()); - dev_ctx.template Alloc(ddout); - } auto* ddx_tensor = ddx.get_ptr(); auto* ddy_tensor = ddy.get_ptr(); auto* dx_tensor = dx.get_ptr(); @@ -238,6 +226,8 @@ void DivideDoubleGradKernel(const Context& dev_ctx, // inplace ddx DenseTensor tmp; if (dout) { + dout->Resize(out.dims()); + dev_ctx.template Alloc(dout); tmp = *dout; } else { tmp.Resize(out.dims()); @@ -247,10 +237,10 @@ void DivideDoubleGradKernel(const Context& dev_ctx, dy->Resize(y.dims()); dev_ctx.template Alloc(dy); if (!ddx_tensor && !ddy_tensor) { - FullLikeKernel(dev_ctx, y, Scalar(0.0), y.dtype(), dy); + FullLikeKernel( + dev_ctx, y, Scalar(static_cast(0.0)), y.dtype(), dy); } else { - // DenseTensor tmp_dy = tmp; - // dX / Y + // pre-compute 'dX / Y' into 'tmp' for 'ddout' and/or 'dy' funcs::DefaultElementwiseOperator, @@ -258,15 +248,6 @@ void DivideDoubleGradKernel(const Context& dev_ctx, dev_ctx, *dx_tensor, y, &tmp, axis); if (ddx_tensor && !ddy_tensor) { // dy = -dX * ddX / Y - // funcs::DefaultElementwiseOperator, - // 
funcs::InverseMultiplyFunctor>( - // dev_ctx, *ddx_tensor, tmp, dy, axis); - // auto& place = *dev_ctx.eigen_device(); - // auto dy_result = phi::EigenVector::Flatten(*dy); - // dy_result.device(place) = static_cast(-1) * dy_result; - phi::funcs::ElemwiseGradCompute, @@ -283,19 +264,6 @@ void DivideDoubleGradKernel(const Context& dev_ctx, DivDoubleDY_Only_DDX()); } else if (!ddx_tensor && ddy_tensor) { // dY = Out * dX * ddY / Y - // VLOG(4) << "1"; - // funcs::DefaultElementwiseOperator, - // funcs::InverseMultiplyFunctor>( - // dev_ctx, *ddy_tensor, tmp, &tmp, axis); - // // VLOG(4) << "2"; - // funcs::DefaultElementwiseOperator, - // funcs::InverseMultiplyFunctor>( - // dev_ctx, out, tmp, dy, axis); - // VLOG(4) << "3"; phi::funcs::ElemwiseGradCompute, @@ -339,7 +307,8 @@ void DivideDoubleGradKernel(const Context& dev_ctx, dev_ctx.template Alloc(ddout); // ddOut = ddX / Y - Out * ddY / Y = (ddX - Out * ddY) / Y if (!ddx_tensor && !ddy_tensor) { - FullLikeKernel(dev_ctx, out, Scalar(0.0), out.dtype(), ddout); + FullLikeKernel( + dev_ctx, out, Scalar(static_cast(0.0)), out.dtype(), ddout); } else if (ddx_tensor != nullptr && ddy_tensor == nullptr) { // ddOut = ddX / Y funcs::DefaultElementwiseOperator>( dev_ctx, *ddx_tensor, y, ddout, axis); } else if (!ddx_tensor && ddy_tensor) { - // VLOG(4) << "4"; // ddOut = - Out * ddY / Y funcs::DefaultElementwiseOperator::Flatten(*ddout); - // VLOG(4) << "6"; ddout_result.device(place) = static_cast(-1) * ddout_result; - // phi::funcs::ElemwiseGradCompute, - // DivDoubleDDOut_Only_DDY>( - // dev_ctx, - // out, - // *ddy_tensor, // ddy - // out, // out - // y, // Y - // axis, - // nullptr, - // ddout, - // DivGradDX(), - // DivDoubleDDOut_Only_DDY()); } else { funcs::DefaultElementwiseOperator, funcs::InverseDivideFunctor>( dev_ctx, tmp, y, ddout, axis); - // phi::funcs::ElemwiseGradCompute, - // DivDoubleDDOut>( - // dev_ctx, - // *ddx_tensor, - // *ddy_tensor, // ddy - // out, // out - // y, // Y - // axis, - // nullptr, - // ddout, - // DivGradDX(), - // DivDoubleDDOut()); } } if (dout) { - dout->Resize(out.dims()); - dev_ctx.template Alloc(dout); if (!ddy_tensor) { - FullLikeKernel(dev_ctx, out, Scalar(0.0), out.dtype(), dout); + FullLikeKernel( + dev_ctx, out, Scalar(static_cast(0.0)), out.dtype(), dout); } else { // dOut = - dX * ddY - // VLOG(4) << "7"; funcs::DefaultElementwiseOperator, @@ -428,7 +365,6 @@ void DivideDoubleGradKernel(const Context& dev_ctx, auto& place = *dev_ctx.eigen_device(); auto dout_result = phi::EigenVector::Flatten(*dout); dout_result.device(place) = static_cast(-1) * dout_result; - // VLOG(4) << "8"; } } } From a74a2fa63baa89fcaab3ac7005b74b7eb1ea556e Mon Sep 17 00:00:00 2001 From: YibinLiu666 <2632839426@qq.com> Date: Sat, 16 Mar 2024 09:16:01 +0000 Subject: [PATCH 09/16] update --- paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h index 15f15ac9a38bc..f0e42708073d5 100644 --- a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h @@ -209,8 +209,8 @@ void DivideDoubleGradKernel(const Context& dev_ctx, auto* ddy_tensor = ddy.get_ptr(); auto* dx_tensor = dx.get_ptr(); DenseTensor dz_div_y; - dz_div_y.Resize(out.dims()); if ((dy || dout) && (!dx_tensor || dx_tensor->dims() != out.dims())) { + dz_div_y.Resize(out.dims()); dev_ctx.template Alloc(&dz_div_y); 
funcs::DefaultElementwiseOperator Date: Mon, 18 Mar 2024 12:15:34 +0000 Subject: [PATCH 10/16] update test --- .../operators/elementwise/elementwise_div_op.cc | 2 +- .../elementwise/test_elementwise_op_grad_grad.h | 14 ++++++++++++-- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cc b/paddle/fluid/operators/elementwise/elementwise_div_op.cc index bcd785293c71e..4029be65a00d6 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cc @@ -107,7 +107,7 @@ class ElementwiseDivDoubleGradMaker : public framework::SingleGradOpMaker { op->SetType("elementwise_div_grad_grad"); op->SetInput("Y", this->Input("Y")); op->SetInput("Out", this->Input("Out")); - op->SetInput("Out@GRAD", this->Input("Out@GRAD")); + op->SetInput("Out@GRAD", this->Input(framework::GradVarName("Out"))); op->SetInput("DDX", this->OutputGrad(framework::GradVarName("X"))); op->SetInput("DDY", this->OutputGrad(framework::GradVarName("Y"))); op->SetInput("DX", this->Output(framework::GradVarName("X"))); diff --git a/test/cpp/fluid/elementwise/test_elementwise_op_grad_grad.h b/test/cpp/fluid/elementwise/test_elementwise_op_grad_grad.h index ab67c559532d9..3e772aa632e52 100644 --- a/test/cpp/fluid/elementwise/test_elementwise_op_grad_grad.h +++ b/test/cpp/fluid/elementwise/test_elementwise_op_grad_grad.h @@ -135,8 +135,18 @@ class TestElementwiseOpGradGrad { expected_outs_[out_name].data(), [](const float &l, const float &r) { return fabs(l - r) < 1e-8; }); #else - auto is_equal = - std::equal(out_ptr, out_ptr + numel, expected_outs_[out_name].data()); + bool is_equal; + if (op_type_ == "elementwise_div_grad_grad") { + is_equal = std::equal(out_ptr, + out_ptr + numel, + expected_outs_[out_name].data(), + [](const float &l, const float &r) { + return fabs(l - r) < 0.0005; + }); + } else { + is_equal = std::equal( + out_ptr, out_ptr + numel, expected_outs_[out_name].data()); + } #endif if (!is_equal) { all_equal = false; From 87350dc85c3c8a19403492df4eb6150e261b88c0 Mon Sep 17 00:00:00 2001 From: YibinLiu666 <2632839426@qq.com> Date: Sun, 24 Mar 2024 12:16:02 +0000 Subject: [PATCH 11/16] update ddout --- .../impl/elementwise_grad_kernel_impl.h | 309 +++++++++++++++--- .../test_elementwise_div_grad_grad.cc | 92 +++++- 2 files changed, 344 insertions(+), 57 deletions(-) diff --git a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h index f0e42708073d5..2ead7cd004428 100644 --- a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h @@ -25,6 +25,7 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" +#include "paddle/phi/kernels/funcs/elementwise_utils.h" namespace phi { @@ -174,24 +175,249 @@ struct DivDoubleDY_Only_DDY { } }; -// template -// struct DivDoubleDDOut { -// HOSTDEVICE T operator()(T x, T y, T out, T dout) const { -// return (x - out * y) / dout; -// } -// }; - -// template -// struct DivDoubleDDOut_Only_DDX { -// HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return x / dout; } -// }; - -// template -// struct DivDoubleDDOut_Only_DDY { -// HOSTDEVICE T operator()(T x, T y, T out, T dout) const { -// return -out * y / dout; -// } -// }; +// ddOut = ddX / Y - Out * ddY / Y = (ddX - Out * ddY) / Y +template +struct DivDoubleDDOut { + HOSTDEVICE T operator()(const T& ddx, + const T& ddy, + const T& y, + const T& out) const { + return (ddx - out * ddy) / y; + } +}; + +template +struct DivDoubleDDOut_Only_DDY { + HOSTDEVICE T operator()(const T& ddx, + const T& ddy, + const T& y, + const T& out) const { + return -out * ddy / y; + } +}; + +template +void ComputeDDoutWithoutBroadcast(const CPUContext& dev_ctx UNUSED, + const phi::DenseTensor& ddx, + const phi::DenseTensor& ddy, + const phi::DenseTensor& y, + const phi::DenseTensor& out, + phi::DenseTensor* ddout, + DDout_OP dout_op) { + auto out_numel = out.numel(); + auto* ddx_data = ddx.data(); + auto* ddy_data = ddy.data(); + auto* y_data = y.data(); + auto* out_data = out.data(); + auto* ddout_data = ddout->data(); + for (int i = 0; i < out_numel; i++) { + ddout_data[i] = dout_op(ddx_data[i], ddy_data[i], y_data[i], out_data[i]); + } +} + +template +void ComputeDDoutWithBroadcast(const CPUContext& dev_ctx UNUSED, + const phi::DenseTensor& ddx, + const phi::DenseTensor& ddy, + const phi::DenseTensor& y, + const phi::DenseTensor& out, + phi::DenseTensor* ddout, + const int* x_dims_array, + const int* y_dims_array, + const int* out_dims_array, + const int max_dim, + DDout_OP dout_op) { + auto out_numel = out.numel(); + auto* ddx_data = ddx.data(); + auto* ddy_data = ddy.data(); + auto* y_data = y.data(); + auto* out_data = out.data(); + auto* ddout_data = ddout->data(); + std::vector index_array(max_dim, 0); + for (int i = 0; i < out_numel; i++) { + int x_index = phi::funcs::GetElementwiseIndex( + x_dims_array, max_dim, index_array.data()); + int y_index = phi::funcs::GetElementwiseIndex( + y_dims_array, max_dim, index_array.data()); + ddout_data[i] = dout_op( + ddx_data[x_index], ddy_data[y_index], y_data[y_index], out_data[i]); + phi::funcs::UpdateElementwiseIndexArray( + out_dims_array, max_dim, index_array.data()); + } +} + +#if defined(__NVCC__) + +template +__global__ void ComputeDDoutWithoutBroadcastGPUKernel(const T* ddx_data, + const T* ddy_data, + const T* y_data, + const T* out_data, + T* ddout_data, + int numel, + DDout_OP dout_op) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid >= numel) return; + ddout_data[tid] = + dout_op(ddx_data[tid], ddy_data[tid], y_data[tid], out_data[tid]); +} +template +void ComputeDDoutWithoutBroadcast(const GPUContext& dev_ctx UNUSED, + const phi::DenseTensor& ddx, + const phi::DenseTensor& ddy, + const phi::DenseTensor& y, + const phi::DenseTensor& out, + phi::DenseTensor* ddout, + DDout_OP dout_op) { + auto out_numel = out.numel(); + auto* ddx_data = ddx.data(); + auto* ddy_data = ddy.data(); + auto* y_data = y.data(); + auto* out_data = out.data(); + auto* ddout_data = ddout->data(); + int 
block = 512; + int64_t grid = (out_numel + block - 1) / block; + auto stream = reinterpret_cast(dev_ctx).stream(); + ComputeDDoutWithoutBroadcastGPUKernel + <<>>( + ddx_data, ddy_data, y_data, out_data, ddout_data, out_numel, dout_op); +} + +template +__global__ void ComputeDDoutWithBroadcastGPUKernel(const T* ddx_data, + const T* ddy_data, + const T* y_data, + const T* out_data, + T* ddout_data, + int numel, + const int* x_dims_array, + const int* y_dims_array, + const int* out_dims_array, + const int max_dim, + DDout_OP dout_op) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid >= numel) return; + int x_index = 0, y_index = 0, x_index_prod = 1, y_index_prod = 1, + out_index = tid, dim_index; + for (int64_t i = max_dim - 1; i >= 0; i--) { + if (out_index == 0) break; + dim_index = out_index % out_dims_array[i]; + out_index = out_index / out_dims_array[i]; + if (x_dims_array[i] > 1) { + x_index += dim_index * x_index_prod; + x_index_prod *= x_dims_array[i]; + } + if (y_dims_array[i] > 1) { + y_index += dim_index * y_index_prod; + y_index_prod *= y_dims_array[i]; + } + } + ddout_data[tid] = dout_op( + ddx_data[x_index], ddy_data[y_index], y_data[y_index], out_data[tid]); +} + +template +void ComputeDDoutWithBroadcast(const GPUContext& dev_ctx UNUSED, + const phi::DenseTensor& ddx, + const phi::DenseTensor& ddy, + const phi::DenseTensor& y, + const phi::DenseTensor& out, + phi::DenseTensor* ddout, + const int* x_dims_array, + const int* y_dims_array, + const int* out_dims_array, + const int max_dim, + DDout_OP dout_op) { + auto out_numel = out.numel(); + auto* ddx_data = ddx.data(); + auto* ddy_data = ddy.data(); + auto* y_data = y.data(); + auto* out_data = out.data(); + auto* ddout_data = ddout->data(); + DenseTensor x_dims_array_gpu; + x_dims_array_gpu.Resize({max_dim}); + int* x_dims_array_gpu_data = dev_ctx.template Alloc(&x_dims_array_gpu); + cudaMemcpy(x_dims_array_gpu_data, + x_dims_array, + sizeof(int) * max_dim, + cudaMemcpyHostToDevice); + DenseTensor y_dims_array_gpu; + y_dims_array_gpu.Resize({max_dim}); + int* y_dims_array_gpu_data = dev_ctx.template Alloc(&y_dims_array_gpu); + cudaMemcpy(y_dims_array_gpu_data, + y_dims_array, + sizeof(int) * max_dim, + cudaMemcpyHostToDevice); + DenseTensor out_dims_array_gpu; + out_dims_array_gpu.Resize({max_dim}); + int* out_dims_array_gpu_data = + dev_ctx.template Alloc(&out_dims_array_gpu); + cudaMemcpy(out_dims_array_gpu_data, + out_dims_array, + sizeof(int) * max_dim, + cudaMemcpyHostToDevice); + int block = 512; + int64_t grid = (out_numel + block - 1) / block; + auto stream = reinterpret_cast(dev_ctx).stream(); + ComputeDDoutWithBroadcastGPUKernel + <<>>(ddx_data, + ddy_data, + y_data, + out_data, + ddout_data, + out_numel, + x_dims_array_gpu_data, + y_dims_array_gpu_data, + out_dims_array_gpu_data, + max_dim, + dout_op); +} + +#endif + +template +void DivDoubleDDoutCompute(const DeviceContext& dev_ctx, + const phi::DenseTensor& ddx, + const phi::DenseTensor& ddy, + const phi::DenseTensor& y, + const phi::DenseTensor& out, + int axis, + phi::DenseTensor* ddout, + DDout_OP dout_op) { + auto x_dims = ddx.dims(); + auto y_dims = ddy.dims(); + if (x_dims == y_dims) { + ComputeDDoutWithoutBroadcast( + dev_ctx, ddx, ddy, y, out, ddout, dout_op); + } else { + int max_dim = std::max(x_dims.size(), y_dims.size()); + axis = (axis == -1 ? 
std::abs(x_dims.size() - y_dims.size()) : axis); + std::vector x_dims_array(max_dim, 0); + std::vector y_dims_array(max_dim, 0); + std::vector out_dims_array(max_dim, 0); + phi::funcs::GetBroadcastDimsArrays(x_dims, + y_dims, + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + axis); + ComputeDDoutWithBroadcast(dev_ctx, + ddx, + ddy, + y, + out, + ddout, + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + dout_op); + } +} template void DivideDoubleGradKernel(const Context& dev_ctx, @@ -318,36 +544,25 @@ void DivideDoubleGradKernel(const Context& dev_ctx, dev_ctx, *ddx_tensor, y, ddout, axis); } else if (!ddx_tensor && ddy_tensor) { // ddOut = - Out * ddY / Y - funcs::DefaultElementwiseOperator, - funcs::InverseMultiplyFunctor>( - dev_ctx, out, *ddy_tensor, &tmp, axis); - // VLOG(4) << "5"; - funcs::DefaultElementwiseOperator, - funcs::InverseDivideFunctor>( - dev_ctx, tmp, y, ddout, axis); - auto& place = *dev_ctx.eigen_device(); - auto ddout_result = phi::EigenVector::Flatten(*ddout); - ddout_result.device(place) = static_cast(-1) * ddout_result; + DivDoubleDDoutCompute, T>( + dev_ctx, + *ddx_tensor, + *ddy_tensor, + y, + out, + axis, + ddout, + DivDoubleDDOut_Only_DDY()); } else { - funcs::DefaultElementwiseOperator, - funcs::InverseMultiplyFunctor>( - dev_ctx, out, *ddy_tensor, &tmp, axis); - funcs::DefaultElementwiseOperator, - funcs::InverseSubtractFunctor>( - dev_ctx, *ddx_tensor, tmp, &tmp, axis); - funcs::DefaultElementwiseOperator, - funcs::InverseDivideFunctor>( - dev_ctx, tmp, y, ddout, axis); + DivDoubleDDoutCompute, T>( + dev_ctx, + *ddx_tensor, + *ddy_tensor, + y, + out, + axis, + ddout, + DivDoubleDDOut()); } } diff --git a/test/cpp/fluid/elementwise/test_elementwise_div_grad_grad.cc b/test/cpp/fluid/elementwise/test_elementwise_div_grad_grad.cc index 8132b3b4bd295..257fa0289ef2a 100644 --- a/test/cpp/fluid/elementwise/test_elementwise_div_grad_grad.cc +++ b/test/cpp/fluid/elementwise/test_elementwise_div_grad_grad.cc @@ -41,17 +41,16 @@ namespace paddle { namespace operators { template -class TestElementwiseDivGradGradWithoutDout - : public TestElementwiseOpGradGrad { +class TestElementwiseDivGradGradWithDout : public TestElementwiseOpGradGrad { public: - TestElementwiseDivGradGradWithoutDout(const platform::Place &place, - const framework::DDim &dims) + TestElementwiseDivGradGradWithDout(const platform::Place &place, + const framework::DDim &dims) : TestElementwiseOpGradGrad( "elementwise_div_grad_grad", place, dims, {"Y", "Out", "Out@GRAD", "DDX", "DDY", "DX"}, - {"Y@GRAD", "DDOut"}) {} + {"Y@GRAD", "DDOut", "DOut"}) {} using TestElementwiseOpGradGrad::feed_datas_; using TestElementwiseOpGradGrad::expected_outs_; @@ -60,6 +59,7 @@ class TestElementwiseDivGradGradWithoutDout size_t numel = static_cast(common::product(dims_)); std::vector dy(numel); std::vector ddout(numel); + std::vector dout(numel); for (size_t i = 0; i < numel; ++i) { // dY(Y@GRAD) = Out * dX * ddY / Y - dX * ddX / Y dy[i] = (feed_datas_["DX"][i] / feed_datas_["Y"][i]) * @@ -69,9 +69,12 @@ class TestElementwiseDivGradGradWithoutDout ddout[i] = (feed_datas_["DDX"][i] - feed_datas_["Out"][i] * feed_datas_["DDY"][i]) / (feed_datas_["Y"][i]); + // dOut = - DX * DDy + dout[i] = -feed_datas_["DX"][i] * feed_datas_["DDY"][i]; } expected_outs_["Y@GRAD"] = dy; expected_outs_["DDOut"] = ddout; + expected_outs_["DOut"] = dout; } std::unique_ptr CreateTestOp() override { @@ -83,27 +86,96 @@ class TestElementwiseDivGradGradWithoutDout {"DDX", {"DDX"}}, 
{"DDY", {"DDY"}}, {"DX", {"DX"}}}, - {{"Y@GRAD", {"Y@GRAD"}}, {"DDOut", {"DDOut"}}}, + {{"Y@GRAD", {"Y@GRAD"}}, {"DDOut", {"DDOut"}}, {"DOut", {"DOut"}}}, {{"use_mkldnn", false}, {"axis", 0}}); return op; } }; -TEST(test_elementwise_div_grad_grad_without_dout, cpu_place) { +TEST(test_elementwise_div_grad_grad, cpu_place) { framework::DDim dims({32, 64}); platform::CPUPlace p; - TestElementwiseDivGradGradWithoutDout test(p, dims); + TestElementwiseDivGradGradWithDout test(p, dims); ASSERT_TRUE(test.Check()); } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -TEST(test_elementwise_div_grad_grad_without_dout, gpu_place) { +TEST(test_elementwise_div_grad_grad, gpu_place) { framework::DDim dims({32, 64}); platform::CUDAPlace p(0); - TestElementwiseDivGradGradWithoutDout test(p, dims); + TestElementwiseDivGradGradWithDout test(p, dims); ASSERT_TRUE(test.Check()); } #endif +// template +// class TestElementwiseDivGradGradWithoutDX +// : public TestElementwiseOpGradGrad { +// public: +// TestElementwiseDivGradGradWithoutDX(const platform::Place &place, +// const framework::DDim &dims) +// : TestElementwiseOpGradGrad( +// "elementwise_div_grad_grad", +// place, +// dims, +// {"Y", "Out", "Out@GRAD", "DDX", "DDY"}, +// {"Y@GRAD", "DDOut", "DOut"}) {} + +// using TestElementwiseOpGradGrad::feed_datas_; +// using TestElementwiseOpGradGrad::expected_outs_; +// using TestElementwiseOpGradGrad::dims_; +// void ComputeExpectedOuts() override { +// size_t numel = static_cast(common::product(dims_)); +// std::vector dy(numel); +// std::vector ddout(numel); +// std::vector dout(numel); +// for (size_t i = 0; i < numel; ++i) { +// // dY(Y@GRAD) = Out * dX * ddY / Y - dX * ddX / Y +// auto dx = feed_datas_["Out@GRAD"][i] / feed_datas_["Y"][i]; +// dy[i] = (dx / feed_datas_["Y"][i]) * +// (feed_datas_["Out"][i] * feed_datas_["DDY"][i] - +// feed_datas_["DDX"][i]); +// // ddOut = ddX / Y - Out * ddY / Y = (ddX - Out * ddY) / Y +// ddout[i] = (feed_datas_["DDX"][i] - +// feed_datas_["Out"][i] * feed_datas_["DDY"][i]) / +// (feed_datas_["Y"][i]); +// // dOut = - DX * DDy +// dout[i] = -dx * feed_datas_["DDY"][i]; +// } +// expected_outs_["Y@GRAD"] = dy; +// expected_outs_["DDOut"] = ddout; +// expected_outs_["DOut"] = dout; +// } + +// std::unique_ptr CreateTestOp() override { +// auto op = framework::OpRegistry::CreateOp( +// this->op_type_, +// {{"Y", {"Y"}}, +// {"Out", {"Out"}}, +// {"Out@GRAD", {"Out@GRAD"}}, +// {"DDX", {"DDX"}}, +// {"DDY", {"DDY"}}}, +// {{"Y@GRAD", {"Y@GRAD"}}, {"DDOut", {"DDOut"}}, {"DOut", {"DOut"}}}, +// {{"use_mkldnn", false}, {"axis", 0}}); +// return op; +// } +// }; + +// TEST(test_elementwise_div_grad_grad_without_dx, cpu_place) { +// framework::DDim dims({32, 64}); +// platform::CPUPlace p; +// TestElementwiseDivGradGradWithoutDX test(p, dims); +// ASSERT_TRUE(test.Check()); +// } + +// #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +// TEST(test_elementwise_div_grad_grad_without_dx, gpu_place) { +// framework::DDim dims({32, 64}); +// platform::CUDAPlace p(0); +// TestElementwiseDivGradGradWithoutDX test(p, dims); +// ASSERT_TRUE(test.Check()); +// } +// #endif + } // namespace operators } // namespace paddle From 0f8b0f446f9d3a3509a6e4835972a39c96a88b00 Mon Sep 17 00:00:00 2001 From: YibinLiu666 <2632839426@qq.com> Date: Wed, 27 Mar 2024 06:24:04 +0000 Subject: [PATCH 12/16] update device --- paddle/phi/kernels/funcs/common_shape.h | 6 +- .../impl/elementwise_grad_kernel_impl.h | 61 +++++++++++++++- .../test_elementwise_div_grad_grad.cc | 69 
------------------- 3 files changed, 62 insertions(+), 74 deletions(-) diff --git a/paddle/phi/kernels/funcs/common_shape.h b/paddle/phi/kernels/funcs/common_shape.h index 19f2fa1f2fac4..f148589262884 100644 --- a/paddle/phi/kernels/funcs/common_shape.h +++ b/paddle/phi/kernels/funcs/common_shape.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include "glog/logging.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" @@ -52,7 +53,7 @@ inline void GetBroadcastDimsArrays(const DDim &x_dims, "Axis should be less than or equal to %d, but received axis is %d.", max_dim, axis)); - + VLOG(4) << "start get shape\n"; if (x_dims.size() > y_dims.size()) { std::fill(y_dims_array, y_dims_array + axis, 1); if (axis + y_dims.size() < max_dim) { @@ -68,7 +69,7 @@ inline void GetBroadcastDimsArrays(const DDim &x_dims, std::copy(x_dims.Get(), x_dims.Get() + x_dims.size(), x_dims_array + axis); std::copy(y_dims.Get(), y_dims.Get() + y_dims.size(), y_dims_array); } - + VLOG(4) << "start for\n"; for (int i = 0; i < max_dim; ++i) { PADDLE_ENFORCE_EQ( x_dims_array[i] == y_dims_array[i] || x_dims_array[i] <= 1 || @@ -91,6 +92,7 @@ inline void GetBroadcastDimsArrays(const DDim &x_dims, out_dims_array[i] = -1; } } + VLOG(4) << "end\n"; } inline void GetPrePostNumel( diff --git a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h index 2ead7cd004428..85e54625e60bd 100644 --- a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h @@ -246,7 +246,7 @@ void ComputeDDoutWithBroadcast(const CPUContext& dev_ctx UNUSED, } } -#if defined(__NVCC__) +#if defined(__NVCC__) || defined(__HIPCC__) template __global__ void ComputeDDoutWithoutBroadcastGPUKernel(const T* ddx_data, @@ -337,25 +337,46 @@ void ComputeDDoutWithBroadcast(const GPUContext& dev_ctx UNUSED, DenseTensor x_dims_array_gpu; x_dims_array_gpu.Resize({max_dim}); int* x_dims_array_gpu_data = dev_ctx.template Alloc(&x_dims_array_gpu); +#if defined(__NVCC__) cudaMemcpy(x_dims_array_gpu_data, x_dims_array, sizeof(int) * max_dim, cudaMemcpyHostToDevice); +#else + hipMemcpy(x_dims_array_gpu_data, + x_dims_array, + sizeof(int) * max_dim, + hipMemcpyHostToDevice); +#endif DenseTensor y_dims_array_gpu; y_dims_array_gpu.Resize({max_dim}); int* y_dims_array_gpu_data = dev_ctx.template Alloc(&y_dims_array_gpu); +#if defined(__NVCC__) cudaMemcpy(y_dims_array_gpu_data, y_dims_array, sizeof(int) * max_dim, cudaMemcpyHostToDevice); +#else + hipMemcpy(y_dims_array_gpu_data, + y_dims_array, + sizeof(int) * max_dim, + hipMemcpyHostToDevice); +#endif DenseTensor out_dims_array_gpu; out_dims_array_gpu.Resize({max_dim}); int* out_dims_array_gpu_data = dev_ctx.template Alloc(&out_dims_array_gpu); +#if defined(__NVCC__) cudaMemcpy(out_dims_array_gpu_data, out_dims_array, sizeof(int) * max_dim, cudaMemcpyHostToDevice); +#else + hipMemcpy(out_dims_array_gpu_data, + out_dims_array, + sizeof(int) * max_dim, + hipMemcpyHostToDevice); +#endif int block = 512; int64_t grid = (out_numel + block - 1) / block; auto stream = reinterpret_cast(dev_ctx).stream(); @@ -543,17 +564,50 @@ void DivideDoubleGradKernel(const Context& dev_ctx, funcs::InverseDivideFunctor>( dev_ctx, *ddx_tensor, y, ddout, axis); } else if (!ddx_tensor && ddy_tensor) { - // ddOut = - Out * ddY / Y +// ddOut = - Out * ddY / Y +#if defined(__xpu__) + funcs::DefaultElementwiseOperator, + 
funcs::InverseMultiplyFunctor>( + dev_ctx, out, *ddy_tensor, &tmp, axis); + funcs::DefaultElementwiseOperator, + funcs::InverseDivideFunctor>( + dev_ctx, tmp, y, ddout, axis); + auto& place = *dev_ctx.eigen_device(); + auto ddout_result = phi::EigenVector::Flatten(*ddout); + ddout_result.device(place) = static_cast(-1) * ddout_result; +#else DivDoubleDDoutCompute, T>( dev_ctx, - *ddx_tensor, + *dx_tensor, *ddy_tensor, y, out, axis, ddout, DivDoubleDDOut_Only_DDY()); +#endif } else { +#if defined(__xpu__) + funcs::DefaultElementwiseOperator, + funcs::InverseMultiplyFunctor>( + dev_ctx, out, *ddy_tensor, &tmp, axis); + funcs::DefaultElementwiseOperator, + funcs::InverseSubtractFunctor>( + dev_ctx, *ddx_tensor, tmp, &tmp, axis); + funcs::DefaultElementwiseOperator, + funcs::InverseDivideFunctor>( + dev_ctx, tmp, y, ddout, axis); +#else DivDoubleDDoutCompute, T>( dev_ctx, *ddx_tensor, @@ -563,6 +617,7 @@ void DivideDoubleGradKernel(const Context& dev_ctx, axis, ddout, DivDoubleDDOut()); +#endif } } diff --git a/test/cpp/fluid/elementwise/test_elementwise_div_grad_grad.cc b/test/cpp/fluid/elementwise/test_elementwise_div_grad_grad.cc index 257fa0289ef2a..a29cc2ea43f7c 100644 --- a/test/cpp/fluid/elementwise/test_elementwise_div_grad_grad.cc +++ b/test/cpp/fluid/elementwise/test_elementwise_div_grad_grad.cc @@ -108,74 +108,5 @@ TEST(test_elementwise_div_grad_grad, gpu_place) { } #endif -// template -// class TestElementwiseDivGradGradWithoutDX -// : public TestElementwiseOpGradGrad { -// public: -// TestElementwiseDivGradGradWithoutDX(const platform::Place &place, -// const framework::DDim &dims) -// : TestElementwiseOpGradGrad( -// "elementwise_div_grad_grad", -// place, -// dims, -// {"Y", "Out", "Out@GRAD", "DDX", "DDY"}, -// {"Y@GRAD", "DDOut", "DOut"}) {} - -// using TestElementwiseOpGradGrad::feed_datas_; -// using TestElementwiseOpGradGrad::expected_outs_; -// using TestElementwiseOpGradGrad::dims_; -// void ComputeExpectedOuts() override { -// size_t numel = static_cast(common::product(dims_)); -// std::vector dy(numel); -// std::vector ddout(numel); -// std::vector dout(numel); -// for (size_t i = 0; i < numel; ++i) { -// // dY(Y@GRAD) = Out * dX * ddY / Y - dX * ddX / Y -// auto dx = feed_datas_["Out@GRAD"][i] / feed_datas_["Y"][i]; -// dy[i] = (dx / feed_datas_["Y"][i]) * -// (feed_datas_["Out"][i] * feed_datas_["DDY"][i] - -// feed_datas_["DDX"][i]); -// // ddOut = ddX / Y - Out * ddY / Y = (ddX - Out * ddY) / Y -// ddout[i] = (feed_datas_["DDX"][i] - -// feed_datas_["Out"][i] * feed_datas_["DDY"][i]) / -// (feed_datas_["Y"][i]); -// // dOut = - DX * DDy -// dout[i] = -dx * feed_datas_["DDY"][i]; -// } -// expected_outs_["Y@GRAD"] = dy; -// expected_outs_["DDOut"] = ddout; -// expected_outs_["DOut"] = dout; -// } - -// std::unique_ptr CreateTestOp() override { -// auto op = framework::OpRegistry::CreateOp( -// this->op_type_, -// {{"Y", {"Y"}}, -// {"Out", {"Out"}}, -// {"Out@GRAD", {"Out@GRAD"}}, -// {"DDX", {"DDX"}}, -// {"DDY", {"DDY"}}}, -// {{"Y@GRAD", {"Y@GRAD"}}, {"DDOut", {"DDOut"}}, {"DOut", {"DOut"}}}, -// {{"use_mkldnn", false}, {"axis", 0}}); -// return op; -// } -// }; - -// TEST(test_elementwise_div_grad_grad_without_dx, cpu_place) { -// framework::DDim dims({32, 64}); -// platform::CPUPlace p; -// TestElementwiseDivGradGradWithoutDX test(p, dims); -// ASSERT_TRUE(test.Check()); -// } - -// #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -// TEST(test_elementwise_div_grad_grad_without_dx, gpu_place) { -// framework::DDim dims({32, 64}); -// platform::CUDAPlace 
p(0);
-// TestElementwiseDivGradGradWithoutDX test(p, dims);
-// ASSERT_TRUE(test.Check());
-// }
-// #endif
 
 }  // namespace operators
 }  // namespace paddle

From 99e52810e277845e93bbd54ff329c20c165466f8 Mon Sep 17 00:00:00 2001
From: YibinLiu666 <2632839426@qq.com>
Date: Wed, 27 Mar 2024 06:27:35 +0000
Subject: [PATCH 13/16] add constant

---
 .../phi/kernels/impl/elementwise_grad_kernel_impl.h | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h
index 85e54625e60bd..899df8710c0ba 100644
--- a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h
@@ -165,12 +165,20 @@ struct DivDoubleDY {
 
 template <typename T>
 struct DivDoubleDY_Only_DDX {
-  HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return -x * dout; }
+  HOSTDEVICE T operator()(const T& x,
+                          const T& y,
+                          const T& out,
+                          const T& dout) const {
+    return -x * dout;
+  }
 };
 
 template <typename T>
 struct DivDoubleDY_Only_DDY {
-  HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
+  HOSTDEVICE T operator()(const T& x,
+                          const T& y,
+                          const T& out,
+                          const T& dout) const {
     return y * out * dout;
   }
 };

From 8016cf502fc8086086717316f0f5347698c3cf42 Mon Sep 17 00:00:00 2001
From: YibinLiu666 <2632839426@qq.com>
Date: Wed, 27 Mar 2024 06:28:42 +0000
Subject: [PATCH 14/16] update

---
 paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h
index 899df8710c0ba..0ece81a3d2384 100644
--- a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h
@@ -158,18 +158,11 @@ struct DivGradDY<phi::dtype::complex<T>> {
 
 template <typename T>
 struct DivDoubleDY {
-  HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
-    return (y * out - x) * dout;
-  }
-};
-
-template <typename T>
-struct DivDoubleDY_Only_DDX {
   HOSTDEVICE T operator()(const T& x,
                           const T& y,
                           const T& out,
                           const T& dout) const {
-    return -x * dout;
+    return (y * out - x) * dout;
   }
 };

From a718d8e2efda1fd68b327f476738a0c3a3721b9d Mon Sep 17 00:00:00 2001
From: YibinLiu666 <2632839426@qq.com>
Date: Wed, 27 Mar 2024 12:01:46 +0000
Subject: [PATCH 15/16] fix bug

---
 paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h
index 0ece81a3d2384..4bd0ede6dc827 100644
--- a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h
@@ -176,6 +176,16 @@ struct DivDoubleDY_Only_DDY {
   }
 };
 
+template <typename T>
+struct DivDoubleDY_Only_DDX {
+  HOSTDEVICE T operator()(const T& x,
+                          const T& y,
+                          const T& out,
+                          const T& dout) const {
+    return -x * dout;
+  }
+};
+
 // ddOut = ddX / Y - Out * ddY / Y = (ddX - Out * ddY) / Y
 template <typename T>
 struct DivDoubleDDOut {

From 156198a72aadbf474301f175e400db5ef81ae6ec Mon Sep 17 00:00:00 2001
From: YibinLiu666 <2632839426@qq.com>
Date: Sat, 30 Mar 2024 15:31:09 +0000
Subject: [PATCH 16/16] remove vlog

---
 paddle/phi/kernels/funcs/common_shape.h | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/paddle/phi/kernels/funcs/common_shape.h b/paddle/phi/kernels/funcs/common_shape.h
index f148589262884..45a1024339ba3 100644
--- a/paddle/phi/kernels/funcs/common_shape.h
+++ b/paddle/phi/kernels/funcs/common_shape.h
@@ -14,7 +14,6 @@ limitations under the License. */
 
 #pragma once
 
-#include "glog/logging.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
 #include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
@@ -53,7 +52,6 @@ inline void GetBroadcastDimsArrays(const DDim &x_dims,
         "Axis should be less than or equal to %d, but received axis is %d.",
         max_dim,
         axis));
-  VLOG(4) << "start get shape\n";
   if (x_dims.size() > y_dims.size()) {
     std::fill(y_dims_array, y_dims_array + axis, 1);
     if (axis + y_dims.size() < max_dim) {
@@ -69,7 +67,6 @@ inline void GetBroadcastDimsArrays(const DDim &x_dims,
     std::copy(x_dims.Get(), x_dims.Get() + x_dims.size(), x_dims_array + axis);
     std::copy(y_dims.Get(), y_dims.Get() + y_dims.size(), y_dims_array);
   }
-  VLOG(4) << "start for\n";
   for (int i = 0; i < max_dim; ++i) {
     PADDLE_ENFORCE_EQ(
         x_dims_array[i] == y_dims_array[i] || x_dims_array[i] <= 1 ||
@@ -92,7 +89,6 @@ inline void GetBroadcastDimsArrays(const DDim &x_dims,
       out_dims_array[i] = -1;
     }
   }
-  VLOG(4) << "end\n";
 }
 
 inline void GetPrePostNumel(
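For reference, with out = x / y the second-order formulas implemented across
this series are: dX = dOut / Y, dY = (Out * ddY - ddX) * dX / Y,
dOut = -dX * ddY, and ddOut = (ddX - Out * ddY) / Y. Below is a minimal
standalone C++ sketch of the same per-element math — not Paddle code — that
assumes equal shapes (no broadcasting) and zero-filled ddX/ddY when an input
is absent; the function name and vector-based interface are invented for
illustration only.

// Standalone reference for the second-order gradients of out = x / y,
// mirroring the fused functors (DivDoubleDY, DivDoubleDDOut) in this series.
#include <cstddef>
#include <vector>

template <typename T>
void DivideDoubleGradReference(const std::vector<T>& y,
                               const std::vector<T>& out,   // out = x / y
                               const std::vector<T>& dout,  // upstream grad
                               const std::vector<T>& ddx,
                               const std::vector<T>& ddy,
                               std::vector<T>& dy,
                               std::vector<T>& new_dout,
                               std::vector<T>& ddout) {
  const std::size_t n = y.size();
  dy.resize(n);
  new_dout.resize(n);
  ddout.resize(n);
  for (std::size_t i = 0; i < n; ++i) {
    // dX = dOut / Y (recomputed when dX is unavailable, as in PATCH 03)
    const T dx = dout[i] / y[i];
    // dY = (Out * ddY - ddX) * dX / Y   (fused form of DivDoubleDY)
    dy[i] = (out[i] * ddy[i] - ddx[i]) * dx / y[i];
    // dOut = -dX * ddY
    new_dout[i] = -dx * ddy[i];
    // ddOut = (ddX - Out * ddY) / Y     (DivDoubleDDOut)
    ddout[i] = (ddx[i] - out[i] * ddy[i]) / y[i];
  }
}

The speedup in the series comes from the same restructuring sketched above:
instead of materializing ddX_safe/ddY_safe zero tensors and chaining several
DefaultElementwiseOperator passes through temporaries, each optional-input
combination gets its own branch, and the dY/ddOut math is fused into a single
functor pass per output.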