From 467d580dde745ebe572eb38db795748b0407111c Mon Sep 17 00:00:00 2001
From: YibinLiu666 <2632839426@qq.com>
Date: Thu, 7 Mar 2024 12:24:38 +0000
Subject: [PATCH 01/16] improve the performance of divide double grad

---
 .../impl/elementwise_grad_kernel_impl.h       | 186 +++++++++++-------
 1 file changed, 116 insertions(+), 70 deletions(-)

diff --git a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h
index db6858bc9d7d7..0641c2601a1d3 100644
--- a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h
@@ -173,26 +173,8 @@ void DivideDoubleGradKernel(const Context& dev_ctx,
                             DenseTensor* dy,
                             DenseTensor* dout,
                             DenseTensor* ddout) {
-  if (dy) {
-    dy->Resize(y.dims());
-    dev_ctx.template Alloc<T>(dy);
-  }
-  if (dout) {
-    dout->Resize(out.dims());
-    dev_ctx.template Alloc<T>(dout);
-  }
-  if (ddout) {
-    ddout->Resize(out.dims());
-    dev_ctx.template Alloc<T>(ddout);
-  }
-  // ddX_safe == null ? 0 : ddX
-  // ddY_safe == null ? 0 : ddY
-  DenseTensor ddX_safe, ddY_safe;
-  phi::funcs::GetDoubleGradSafeTensor<T, Context>(
-      dev_ctx, dx, ddx.get_ptr(), &ddX_safe);
-  phi::funcs::GetDoubleGradSafeTensor<T, Context>(
-      dev_ctx, y, ddy.get_ptr(), &ddY_safe);
-
+  auto* ddx_tensor = ddx.get_ptr();
+  auto* ddy_tensor = ddy.get_ptr();
   // ddOut = ddX / Y - Out * ddY / Y = (ddX - Out * ddY) / Y
   // dY = Out * dX * ddY / Y - dX * ddX / Y
   // dOut = - dX * ddY
@@ -206,63 +188,127 @@ void DivideDoubleGradKernel(const Context& dev_ctx,
     dev_ctx.template Alloc<T>(&tmp);
   }
   if (dy) {
-    // dX_div_Y = dX / Y;
-    DenseTensor dX_div_Y = tmp;
-    funcs::DefaultElementwiseOperator<Context,
-                                      T,
-                                      funcs::DivideFunctor<T>,
-                                      funcs::InverseDivideFunctor<T>>(
-        dev_ctx, dx, y, &dX_div_Y, axis);
-
-    // NOTE(dengkaipeng): in the following ElemwiseGradCompute, for the
-    // first output tensor is nullptr, the branch to calculate first
-    // output tensor will not be activated, DivGradDx function will not
-    // be called and can be ignored, the first branch has little effect
-    // on running speed.
-
-    // dY = Out * dX * ddY / Y - dX * ddX / Y
-    phi::funcs::ElemwiseGradCompute<Context, T, DivGradDX<T>, DivDoubleDY<T>>(
-        dev_ctx,
-        ddX_safe,
-        ddY_safe,
-        out,
-        dX_div_Y,
-        axis,
-        nullptr,
-        dy,
-        DivGradDX<T>(),
-        DivDoubleDY<T>());
+    dy->Resize(y.dims());
+    dev_ctx.template Alloc<T>(dy);
+    if (ddx_tensor == nullptr && ddy_tensor == nullptr) {
+      dy = nullptr;
+    } else {
+      DenseTensor tmp_dy = tmp;
+      // dX / Y
+      funcs::DefaultElementwiseOperator<Context,
+                                        T,
+                                        funcs::DivideFunctor<T>,
+                                        funcs::InverseDivideFunctor<T>>(
+          dev_ctx, dx, y, &tmp_dy, axis);
+      if (ddx_tensor != nullptr && ddy_tensor == nullptr) {
+        // dy = -dX * ddX / Y
+        funcs::DefaultElementwiseOperator<Context,
+                                          T,
+                                          funcs::MultiplyFunctor<T>,
+                                          funcs::InverseMultiplyFunctor<T>>(
+            dev_ctx, *ddx_tensor, tmp_dy, dy, axis);
+        auto& place = *dev_ctx.eigen_device();
+        auto dy_result = phi::EigenVector<T>::Flatten(*dy);
+        dy_result.device(place) = static_cast<T>(-1) * dy_result;
+      } else if (ddx_tensor == nullptr && ddy_tensor != nullptr) {
+        // dY = Out * dX * ddY / Y
+        funcs::DefaultElementwiseOperator<Context,
+                                          T,
+                                          funcs::MultiplyFunctor<T>,
+                                          funcs::InverseMultiplyFunctor<T>>(
+            dev_ctx, *ddy_tensor, tmp_dy, &tmp_dy, axis);
+        funcs::DefaultElementwiseOperator<Context,
+                                          T,
+                                          funcs::MultiplyFunctor<T>,
+                                          funcs::InverseMultiplyFunctor<T>>(
+            dev_ctx, out, tmp_dy, dy, axis);
+      } else {
+        // dY = Out * dX * ddY / Y - dX * ddX / Y
+
+        // NOTE(dengkaipeng): in the following ElemwiseGradCompute, when the
+        // first output tensor is nullptr, the branch that calculates the
+        // first output tensor is not taken, so DivGradDX will not be called
+        // and can be ignored; the first branch has little effect on
+        // running speed.
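+        // Here tmp_dy already holds dX / Y, so DivDoubleDY receives
+        // (x, y, out, dout) = (ddX, ddY, Out, dX / Y) and produces
+        // dY = ddY * Out * (dX / Y) - ddX * (dX / Y) in one pass.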
+        phi::funcs::
+            ElemwiseGradCompute<Context, T, DivGradDX<T>, DivDoubleDY<T>>(
+                dev_ctx,
+                *ddx_tensor,
+                *ddy_tensor,
+                out,
+                tmp_dy,
+                axis,
+                nullptr,
+                dy,
+                DivGradDX<T>(),
+                DivDoubleDY<T>());
+      }
+    }
   }
   if (ddout) {
+    ddout->Resize(out.dims());
+    dev_ctx.template Alloc<T>(ddout);
     // ddOut = ddX / Y - Out * ddY / Y = (ddX - Out * ddY) / Y
-    funcs::DefaultElementwiseOperator<Context,
-                                      T,
-                                      funcs::MultiplyFunctor<T>,
-                                      funcs::InverseMultiplyFunctor<T>>(
-        dev_ctx, out, ddY_safe, &tmp, axis);
-    funcs::DefaultElementwiseOperator<Context,
-                                      T,
-                                      funcs::SubtractFunctor<T>,
-                                      funcs::InverseSubtractFunctor<T>>(
-        dev_ctx, ddX_safe, tmp, &tmp, axis);
-    funcs::DefaultElementwiseOperator<Context,
-                                      T,
-                                      funcs::DivideFunctor<T>,
-                                      funcs::InverseDivideFunctor<T>>(
-        dev_ctx, tmp, y, ddout, axis);
+    if (ddx_tensor == nullptr && ddy_tensor == nullptr) {
+      ddout = nullptr;
+    } else if (ddx_tensor != nullptr && ddy_tensor == nullptr) {
+      // ddOut = ddX / Y
+      funcs::DefaultElementwiseOperator<Context,
+                                        T,
+                                        funcs::DivideFunctor<T>,
+                                        funcs::InverseDivideFunctor<T>>(
+          dev_ctx, *ddx_tensor, y, ddout, axis);
+    } else if (ddx_tensor == nullptr && ddy_tensor != nullptr) {
+      // ddOut = - Out * ddY / Y
+      funcs::DefaultElementwiseOperator<Context,
+                                        T,
+                                        funcs::MultiplyFunctor<T>,
+                                        funcs::InverseMultiplyFunctor<T>>(
+          dev_ctx, out, *ddy_tensor, &tmp, axis);
+      funcs::DefaultElementwiseOperator<Context,
+                                        T,
+                                        funcs::DivideFunctor<T>,
+                                        funcs::InverseDivideFunctor<T>>(
+          dev_ctx, tmp, y, ddout, axis);
+      auto& place = *dev_ctx.eigen_device();
+      auto ddout_result = phi::EigenVector<T>::Flatten(*ddout);
+      ddout_result.device(place) = static_cast<T>(-1) * ddout_result;
+    } else {
+      funcs::DefaultElementwiseOperator<Context,
+                                        T,
+                                        funcs::MultiplyFunctor<T>,
+                                        funcs::InverseMultiplyFunctor<T>>(
+          dev_ctx, out, *ddy_tensor, &tmp, axis);
+      funcs::DefaultElementwiseOperator<Context,
+                                        T,
+                                        funcs::SubtractFunctor<T>,
+                                        funcs::InverseSubtractFunctor<T>>(
+          dev_ctx, *ddx_tensor, tmp, &tmp, axis);
+      funcs::DefaultElementwiseOperator<Context,
+                                        T,
+                                        funcs::DivideFunctor<T>,
+                                        funcs::InverseDivideFunctor<T>>(
+          dev_ctx, tmp, y, ddout, axis);
+    }
   }
   if (dout) {
-    // dOut = - dX * ddY
-    funcs::DefaultElementwiseOperator<Context,
-                                      T,
-                                      funcs::MultiplyFunctor<T>,
-                                      funcs::InverseMultiplyFunctor<T>>(
-        dev_ctx, dx, ddY_safe, dout, axis);
-    auto& place = *dev_ctx.eigen_device();
-    auto dout_result = phi::EigenVector<T>::Flatten(*dout);
-    dout_result.device(place) = static_cast<T>(-1) * dout_result;
+    dout->Resize(out.dims());
+    dev_ctx.template Alloc<T>(dout);
+    if (ddy_tensor == nullptr) {
+      dout = nullptr;
+    } else {
+      // dOut = - dX * ddY
+      funcs::DefaultElementwiseOperator<Context,
+                                        T,
+                                        funcs::MultiplyFunctor<T>,
+                                        funcs::InverseMultiplyFunctor<T>>(
+          dev_ctx, dx, *ddy_tensor, dout, axis);
+      auto& place = *dev_ctx.eigen_device();
+      auto dout_result = phi::EigenVector<T>::Flatten(*dout);
+      dout_result.device(place) = static_cast<T>(-1) * dout_result;
+    }
   }
 }
 
 template <typename T, typename Context>

From 5563d5ee2e6a89c573bb6d8e6953bf27ef0aea7f Mon Sep 17 00:00:00 2001
From: YibinLiu666 <2632839426@qq.com>
Date: Thu, 7 Mar 2024 16:05:52 +0000
Subject: [PATCH 02/16] fix infermeta

---
 paddle/phi/api/yaml/legacy_backward.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml
index e5529aa6c5efa..1ee9a29d7e644 100755
--- a/paddle/phi/api/yaml/legacy_backward.yaml
+++ b/paddle/phi/api/yaml/legacy_backward.yaml
@@ -179,7 +179,7 @@
   output : Tensor(y_grad), Tensor(out_grad), Tensor(grad_out_grad)
   infer_meta :
     func : GeneralTernaryGradInferMeta
-    param : [y, grad_x, grad_x]
+    param : [y, out, out]
   kernel :
     func : divide_double_grad
     data_type : out

From 2683315e88e32e3ad98ec3a59e3d63544344166f Mon Sep 17 00:00:00 2001
From: YibinLiu666 <2632839426@qq.com>
Date: Sat, 9 Mar 2024 04:38:17 +0000
Subject: [PATCH 03/16] update

---
 .../ops_signature/elementwise_sig.cc          | 2 +-
 .../pir/dialect/operator/ir/ops_backward.yaml | 6 ++--
 paddle/phi/api/yaml/legacy_backward.yaml      | 4 +--
.../kernels/elementwise_divide_grad_kernel.h | 3 +- .../impl/elementwise_grad_kernel_impl.h | 30 +++++++++++++++++-- 5 files changed, 35 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/operators/ops_signature/elementwise_sig.cc b/paddle/fluid/operators/ops_signature/elementwise_sig.cc index b1150268fbad1..82f891bb48a00 100644 --- a/paddle/fluid/operators/ops_signature/elementwise_sig.cc +++ b/paddle/fluid/operators/ops_signature/elementwise_sig.cc @@ -168,7 +168,7 @@ KernelSignature ElementwiseDivGradOpArgumentMapping( KernelSignature ElementwiseDivDoubleGradOpArgumentMapping( const ArgumentMappingContext& ctx UNUSED) { return KernelSignature("divide_double_grad", - {"Y", "Out", "DX", "DDX", "DDY"}, + {"Y", "Out", "Out@GRAD", "DX", "DDX", "DDY"}, {"axis"}, {"Y@GRAD", "DOut", "DDOut"}); } diff --git a/paddle/fluid/pir/dialect/operator/ir/ops_backward.yaml b/paddle/fluid/pir/dialect/operator/ir/ops_backward.yaml index 7b3068a8ab6c9..9f115d7744650 100644 --- a/paddle/fluid/pir/dialect/operator/ir/ops_backward.yaml +++ b/paddle/fluid/pir/dialect/operator/ir/ops_backward.yaml @@ -190,15 +190,15 @@ - backward_op : divide_double_grad forward : divide_grad (Tensor x, Tensor y, Tensor out, Tensor grad_out, int axis = -1) -> Tensor(grad_x), Tensor(grad_y) - args : (Tensor y, Tensor out, Tensor grad_x, Tensor grad_x_grad, Tensor grad_y_grad, int axis = -1) + args : (Tensor y, Tensor out, Tensor grad_out, Tensor grad_x, Tensor grad_x_grad, Tensor grad_y_grad, int axis = -1) output : Tensor(y_grad), Tensor(out_grad), Tensor(grad_out_grad) infer_meta : func : GeneralTernaryGradInferMeta - param : [y, grad_x, grad_x] + param : [y, out, out] kernel : func : divide_double_grad data_type : out - optional : grad_x_grad, grad_y_grad + optional : grad_x, grad_x_grad, grad_y_grad inplace : (grad_x_grad -> grad_out_grad) - backward_op : divide_grad diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index 1ee9a29d7e644..a22a25eeda48a 100755 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -175,7 +175,7 @@ - backward_op : divide_double_grad forward : divide_grad (Tensor x, Tensor y, Tensor out, Tensor grad_out, int axis = -1) -> Tensor(grad_x), Tensor(grad_y) - args : (Tensor y, Tensor out, Tensor grad_x, Tensor grad_x_grad, Tensor grad_y_grad, int axis = -1) + args : (Tensor y, Tensor out, Tensor grad_out, Tensor grad_x, Tensor grad_x_grad, Tensor grad_y_grad, int axis = -1) output : Tensor(y_grad), Tensor(out_grad), Tensor(grad_out_grad) infer_meta : func : GeneralTernaryGradInferMeta @@ -183,7 +183,7 @@ kernel : func : divide_double_grad data_type : out - optional : grad_x_grad, grad_y_grad + optional : grad_x, grad_x_grad, grad_y_grad inplace : (grad_x_grad -> grad_out_grad) - backward_op : divide_grad diff --git a/paddle/phi/kernels/elementwise_divide_grad_kernel.h b/paddle/phi/kernels/elementwise_divide_grad_kernel.h index c764f05c3983f..15b1e65a9cfdf 100644 --- a/paddle/phi/kernels/elementwise_divide_grad_kernel.h +++ b/paddle/phi/kernels/elementwise_divide_grad_kernel.h @@ -33,7 +33,8 @@ template void DivideDoubleGradKernel(const Context& dev_ctx, const DenseTensor& y, const DenseTensor& out, - const DenseTensor& dx, + const DenseTensor& grad_out, + const paddle::optional& dx, const paddle::optional& ddx, const paddle::optional& ddy, int axis, diff --git a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h index 0641c2601a1d3..d6d9bb1c726cc 
100644 --- a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h @@ -166,7 +166,8 @@ template void DivideDoubleGradKernel(const Context& dev_ctx, const DenseTensor& y, const DenseTensor& out, - const DenseTensor& dx, + const DenseTensor& grad_out, + const paddle::optional& dx, const paddle::optional& ddx, const paddle::optional& ddy, int axis, @@ -175,6 +176,7 @@ void DivideDoubleGradKernel(const Context& dev_ctx, DenseTensor* ddout) { auto* ddx_tensor = ddx.get_ptr(); auto* ddy_tensor = ddy.get_ptr(); + auto* dx_tensor = dx.get_ptr(); // ddOut = ddX / Y - Out * ddY / Y = (ddX - Out * ddY) / Y // dY = Out * dX * ddY / Y - dX * ddX / Y // dOut = - dX * ddY @@ -193,13 +195,24 @@ void DivideDoubleGradKernel(const Context& dev_ctx, if (ddx_tensor == nullptr && ddy_tensor == nullptr) { dy = nullptr; } else { + if (dx_tensor == nullptr || dx_tensor->dims() != out.dims()) { + DenseTensor dz_div_y; + dz_div_y.Resize(out.dims()); + dev_ctx.template Alloc(&dz_div_y); + funcs::DefaultElementwiseOperator, + funcs::InverseDivideFunctor>( + dev_ctx, grad_out, y, &dz_div_y, axis); + dx_tensor = &dz_div_y; + } DenseTensor tmp_dy = tmp; // dX / Y funcs::DefaultElementwiseOperator, funcs::InverseDivideFunctor>( - dev_ctx, dx, y, &tmp_dy, axis); + dev_ctx, *dx_tensor, y, &tmp_dy, axis); if (ddx_tensor != nullptr && ddy_tensor == nullptr) { // dy = -dX * ddX / Y funcs::DefaultElementwiseOperatordims() != out.dims()) { + DenseTensor dz_div_y; + dz_div_y.Resize(out.dims()); + dev_ctx.template Alloc(&dz_div_y); + funcs::DefaultElementwiseOperator, + funcs::InverseDivideFunctor>( + dev_ctx, grad_out, y, &dz_div_y, axis); + dx_tensor = &dz_div_y; + } // dOut = - dX * ddY funcs::DefaultElementwiseOperator, funcs::InverseMultiplyFunctor>( - dev_ctx, dx, *ddy_tensor, dout, axis); + dev_ctx, *dx_tensor, *ddy_tensor, dout, axis); auto& place = *dev_ctx.eigen_device(); auto dout_result = phi::EigenVector::Flatten(*dout); dout_result.device(place) = static_cast(-1) * dout_result; From f28f02b7790d254c1660386ce4dcc9f5ef471e42 Mon Sep 17 00:00:00 2001 From: YibinLiu666 <2632839426@qq.com> Date: Mon, 11 Mar 2024 11:44:13 +0000 Subject: [PATCH 04/16] fix some bug --- .../impl/elementwise_grad_kernel_impl.h | 33 +++++++------------ 1 file changed, 11 insertions(+), 22 deletions(-) diff --git a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h index d6d9bb1c726cc..e1595df42bbfe 100644 --- a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h @@ -177,6 +177,17 @@ void DivideDoubleGradKernel(const Context& dev_ctx, auto* ddx_tensor = ddx.get_ptr(); auto* ddy_tensor = ddy.get_ptr(); auto* dx_tensor = dx.get_ptr(); + DenseTensor dz_div_y; + dz_div_y.Resize(out.dims()); + if (dx_tensor == nullptr || dx_tensor->dims() != out.dims()) { + dev_ctx.template Alloc(&dz_div_y); + funcs::DefaultElementwiseOperator, + funcs::InverseDivideFunctor>( + dev_ctx, grad_out, y, &dz_div_y, axis); + dx_tensor = &dz_div_y; + } // ddOut = ddX / Y - Out * ddY / Y = (ddX - Out * ddY) / Y // dY = Out * dX * ddY / Y - dX * ddX / Y // dOut = - dX * ddY @@ -195,17 +206,6 @@ void DivideDoubleGradKernel(const Context& dev_ctx, if (ddx_tensor == nullptr && ddy_tensor == nullptr) { dy = nullptr; } else { - if (dx_tensor == nullptr || dx_tensor->dims() != out.dims()) { - DenseTensor dz_div_y; - dz_div_y.Resize(out.dims()); - dev_ctx.template Alloc(&dz_div_y); - 
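+        // dX is optional here; when it is absent or was reduced by
+        // broadcasting, rebuild it from the first-order formula
+        // dX = dOut / Y before using it below: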
funcs::DefaultElementwiseOperator, - funcs::InverseDivideFunctor>( - dev_ctx, grad_out, y, &dz_div_y, axis); - dx_tensor = &dz_div_y; - } DenseTensor tmp_dy = tmp; // dX / Y funcs::DefaultElementwiseOperatordims() != out.dims()) { - DenseTensor dz_div_y; - dz_div_y.Resize(out.dims()); - dev_ctx.template Alloc(&dz_div_y); - funcs::DefaultElementwiseOperator, - funcs::InverseDivideFunctor>( - dev_ctx, grad_out, y, &dz_div_y, axis); - dx_tensor = &dz_div_y; - } // dOut = - dX * ddY funcs::DefaultElementwiseOperator Date: Tue, 12 Mar 2024 12:40:31 +0000 Subject: [PATCH 05/16] fix bug and update test --- .../impl/elementwise_grad_kernel_impl.h | 20 +++++++++---------- .../test_elementwise_div_grad_grad.cc | 12 ++++++----- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h index e1595df42bbfe..06ab33267e040 100644 --- a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h @@ -179,7 +179,7 @@ void DivideDoubleGradKernel(const Context& dev_ctx, auto* dx_tensor = dx.get_ptr(); DenseTensor dz_div_y; dz_div_y.Resize(out.dims()); - if (dx_tensor == nullptr || dx_tensor->dims() != out.dims()) { + if (!dx_tensor || dx_tensor->dims() != out.dims()) { dev_ctx.template Alloc(&dz_div_y); funcs::DefaultElementwiseOperatorResize(y.dims()); dev_ctx.template Alloc(dy); - if (ddx_tensor == nullptr && ddy_tensor == nullptr) { - dy = nullptr; + if (!ddx_tensor && !ddy_tensor) { + FullLikeKernel(dev_ctx, y, Scalar(0.0), y.dtype(), dy); } else { DenseTensor tmp_dy = tmp; // dX / Y @@ -213,7 +213,7 @@ void DivideDoubleGradKernel(const Context& dev_ctx, funcs::DivideFunctor, funcs::InverseDivideFunctor>( dev_ctx, *dx_tensor, y, &tmp_dy, axis); - if (ddx_tensor != nullptr && ddy_tensor == nullptr) { + if (ddx_tensor && !ddy_tensor) { // dy = -dX * ddX / Y funcs::DefaultElementwiseOperator::Flatten(*dy); dy_result.device(place) = static_cast(-1) * dy_result; - } else if (ddx_tensor == nullptr && ddy_tensor != nullptr) { + } else if (!ddx_tensor && ddy_tensor) { // dY = Out * dX * ddY / Y funcs::DefaultElementwiseOperatorResize(out.dims()); dev_ctx.template Alloc(ddout); // ddOut = ddX / Y - Out * ddY / Y = (ddX - Out * ddY) / Y - if (ddx_tensor == nullptr && ddy_tensor == nullptr) { - ddout = nullptr; + if (!ddx_tensor && !ddy_tensor) { + FullLikeKernel(dev_ctx, out, Scalar(0.0), out.dtype(), ddout); } else if (ddx_tensor != nullptr && ddy_tensor == nullptr) { // ddOut = ddX / Y funcs::DefaultElementwiseOperator, funcs::InverseDivideFunctor>( dev_ctx, *ddx_tensor, y, ddout, axis); - } else if (ddx_tensor == nullptr && ddy_tensor != nullptr) { + } else if (!ddx_tensor && ddy_tensor) { // ddOut = - Out * ddY / Y funcs::DefaultElementwiseOperatorResize(out.dims()); dev_ctx.template Alloc(dout); - if (ddy_tensor == nullptr) { - dout = nullptr; + if (!ddy_tensor) { + FullLikeKernel(dev_ctx, out, Scalar(0.0), out.dtype(), dout); } else { // dOut = - dX * ddY funcs::DefaultElementwiseOperator("elementwise_div_grad_grad", - place, - dims, - {"Y", "Out", "DDX", "DDY", "DX"}, - {"Y@GRAD", "DDOut"}) {} + : TestElementwiseOpGradGrad( + "elementwise_div_grad_grad", + place, + dims, + {"Y", "Out", "Out@GRAD", "DDX", "DDY", "DX"}, + {"Y@GRAD", "DDOut"}) {} using TestElementwiseOpGradGrad::feed_datas_; using TestElementwiseOpGradGrad::expected_outs_; @@ -78,6 +79,7 @@ class TestElementwiseDivGradGradWithoutDout this->op_type_, {{"Y", {"Y"}}, {"Out", 
{"Out"}}, + {"Out@GRAD", {"Out@GRAD"}}, {"DDX", {"DDX"}}, {"DDY", {"DDY"}}, {"DX", {"DX"}}}, From fd22057481e58cf48b8bab658fe1c5beedd0ca64 Mon Sep 17 00:00:00 2001 From: YibinLiu666 <2632839426@qq.com> Date: Wed, 13 Mar 2024 13:08:02 +0000 Subject: [PATCH 06/16] update --- .../impl/elementwise_grad_kernel_impl.h | 204 +++++++++++++----- 1 file changed, 150 insertions(+), 54 deletions(-) diff --git a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h index 06ab33267e040..75e925ff95b0b 100644 --- a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h @@ -158,7 +158,38 @@ struct DivGradDY> { template struct DivDoubleDY { HOSTDEVICE T operator()(T x, T y, T out, T dout) const { - return y * out * dout - x * dout; + return (y * out - x) * dout; + } +}; + +template +struct DivDoubleDY_Only_DDX { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return -x * dout; } +}; + +template +struct DivDoubleDY_Only_DDY { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { + return y * out * dout; + } +}; + +template +struct DivDoubleDDOut { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { + return (x - out * y) / dout; + } +}; + +template +struct DivDoubleDDOut_Only_DDX { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return x / dout; } +}; + +template +struct DivDoubleDDOut_Only_DDY { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { + return -out * y / dout; } }; @@ -179,7 +210,7 @@ void DivideDoubleGradKernel(const Context& dev_ctx, auto* dx_tensor = dx.get_ptr(); DenseTensor dz_div_y; dz_div_y.Resize(out.dims()); - if (!dx_tensor || dx_tensor->dims() != out.dims()) { + if ((dy || dout) && (!dx_tensor || dx_tensor->dims() != out.dims())) { dev_ctx.template Alloc(&dz_div_y); funcs::DefaultElementwiseOperator(dev_ctx, y, Scalar(0.0), y.dtype(), dy); } else { - DenseTensor tmp_dy = tmp; + // DenseTensor tmp_dy = tmp; // dX / Y funcs::DefaultElementwiseOperator, funcs::InverseDivideFunctor>( - dev_ctx, *dx_tensor, y, &tmp_dy, axis); + dev_ctx, *dx_tensor, y, &tmp, axis); if (ddx_tensor && !ddy_tensor) { // dy = -dX * ddX / Y - funcs::DefaultElementwiseOperator, - funcs::InverseMultiplyFunctor>( - dev_ctx, *ddx_tensor, tmp_dy, dy, axis); - auto& place = *dev_ctx.eigen_device(); - auto dy_result = phi::EigenVector::Flatten(*dy); - dy_result.device(place) = static_cast(-1) * dy_result; + // funcs::DefaultElementwiseOperator, + // funcs::InverseMultiplyFunctor>( + // dev_ctx, *ddx_tensor, tmp, dy, axis); + // auto& place = *dev_ctx.eigen_device(); + // auto dy_result = phi::EigenVector::Flatten(*dy); + // dy_result.device(place) = static_cast(-1) * dy_result; + + phi::funcs::ElemwiseGradCompute, + DivDoubleDY_Only_DDX>( + dev_ctx, + *ddx_tensor, // ddx + y, + out, // out + tmp, // dX /Y + axis, + nullptr, + dy, + DivGradDX(), + DivDoubleDY_Only_DDX()); } else if (!ddx_tensor && ddy_tensor) { // dY = Out * dX * ddY / Y - funcs::DefaultElementwiseOperator, - funcs::InverseMultiplyFunctor>( - dev_ctx, *ddy_tensor, tmp_dy, &tmp_dy, axis); - funcs::DefaultElementwiseOperator, - funcs::InverseMultiplyFunctor>( - dev_ctx, out, tmp_dy, dy, axis); + // VLOG(4) << "1"; + // funcs::DefaultElementwiseOperator, + // funcs::InverseMultiplyFunctor>( + // dev_ctx, *ddy_tensor, tmp, &tmp, axis); + // // VLOG(4) << "2"; + // funcs::DefaultElementwiseOperator, + // funcs::InverseMultiplyFunctor>( + // dev_ctx, out, tmp, dy, axis); + // VLOG(4) << "3"; + 
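+          // note: DivDoubleDY_Only_DDY ignores the value of its first
+          // input; it is passed only so ElemwiseGradCompute sees the
+          // right shapes for broadcasting.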
phi::funcs::ElemwiseGradCompute, + DivDoubleDY_Only_DDY>( + dev_ctx, + out, + *ddy_tensor, // ddy + out, // out + tmp, // dX / Y + axis, + nullptr, + dy, + DivGradDX(), + DivDoubleDY_Only_DDY()); } else { // dY = Out * dX * ddY / Y - dX * ddX / Y @@ -246,10 +309,10 @@ void DivideDoubleGradKernel(const Context& dev_ctx, phi::funcs:: ElemwiseGradCompute, DivDoubleDY>( dev_ctx, - *ddx_tensor, - *ddy_tensor, - out, - tmp_dy, + *ddx_tensor, // ddx + *ddy_tensor, // ddy + out, // out + tmp, // dX / Y axis, nullptr, dy, @@ -273,36 +336,67 @@ void DivideDoubleGradKernel(const Context& dev_ctx, funcs::InverseDivideFunctor>( dev_ctx, *ddx_tensor, y, ddout, axis); } else if (!ddx_tensor && ddy_tensor) { + // VLOG(4) << "4"; // ddOut = - Out * ddY / Y - funcs::DefaultElementwiseOperator, - funcs::InverseMultiplyFunctor>( - dev_ctx, out, *ddy_tensor, &tmp, axis); - funcs::DefaultElementwiseOperator, - funcs::InverseDivideFunctor>( - dev_ctx, tmp, y, ddout, axis); - auto& place = *dev_ctx.eigen_device(); - auto ddout_result = phi::EigenVector::Flatten(*ddout); - ddout_result.device(place) = static_cast(-1) * ddout_result; + // funcs::DefaultElementwiseOperator, + // funcs::InverseMultiplyFunctor>( + // dev_ctx, out, *ddy_tensor, &tmp, axis); + // // VLOG(4) << "5"; + // funcs::DefaultElementwiseOperator, + // funcs::InverseDivideFunctor>( + // dev_ctx, tmp, y, ddout, axis); + // auto& place = *dev_ctx.eigen_device(); + // auto ddout_result = phi::EigenVector::Flatten(*ddout); + // // VLOG(4) << "6"; + // ddout_result.device(place) = static_cast(-1) * ddout_result; + phi::funcs::ElemwiseGradCompute, + DivDoubleDDOut_Only_DDY>( + dev_ctx, + out, + *ddy_tensor, // ddy + out, // out + y, // Y + axis, + nullptr, + dy, + DivGradDX(), + DivDoubleDDOut_Only_DDY()); } else { - funcs::DefaultElementwiseOperator, - funcs::InverseMultiplyFunctor>( - dev_ctx, out, *ddy_tensor, &tmp, axis); - funcs::DefaultElementwiseOperator, - funcs::InverseSubtractFunctor>( - dev_ctx, *ddx_tensor, tmp, &tmp, axis); - funcs::DefaultElementwiseOperator, - funcs::InverseDivideFunctor>( - dev_ctx, tmp, y, ddout, axis); + // funcs::DefaultElementwiseOperator, + // funcs::InverseMultiplyFunctor>( + // dev_ctx, out, *ddy_tensor, &tmp, axis); + // funcs::DefaultElementwiseOperator, + // funcs::InverseSubtractFunctor>( + // dev_ctx, *ddx_tensor, tmp, &tmp, axis); + // funcs::DefaultElementwiseOperator, + // funcs::InverseDivideFunctor>( + // dev_ctx, tmp, y, ddout, axis); + phi::funcs::ElemwiseGradCompute, + DivDoubleDDOut_Only_DDX>( + dev_ctx, + *ddx_tensor, + y, // ddy + out, // out + y, // Y + axis, + nullptr, + dy, + DivGradDX(), + DivDoubleDDOut_Only_DDX()); } } @@ -313,6 +407,7 @@ void DivideDoubleGradKernel(const Context& dev_ctx, FullLikeKernel(dev_ctx, out, Scalar(0.0), out.dtype(), dout); } else { // dOut = - dX * ddY + // VLOG(4) << "7"; funcs::DefaultElementwiseOperator, @@ -321,6 +416,7 @@ void DivideDoubleGradKernel(const Context& dev_ctx, auto& place = *dev_ctx.eigen_device(); auto dout_result = phi::EigenVector::Flatten(*dout); dout_result.device(place) = static_cast(-1) * dout_result; + // VLOG(4) << "8"; } } } From ef8f29d99ab5b968a03dae335538aae5cdf4757e Mon Sep 17 00:00:00 2001 From: YibinLiu666 <2632839426@qq.com> Date: Sat, 16 Mar 2024 08:35:41 +0000 Subject: [PATCH 07/16] fix bug --- .../elementwise/elementwise_div_op.cc | 1 + .../impl/elementwise_grad_kernel_impl.h | 130 ++++++++++-------- 2 files changed, 72 insertions(+), 59 deletions(-) diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cc 
b/paddle/fluid/operators/elementwise/elementwise_div_op.cc index 191890865fb89..bcd785293c71e 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cc @@ -107,6 +107,7 @@ class ElementwiseDivDoubleGradMaker : public framework::SingleGradOpMaker { op->SetType("elementwise_div_grad_grad"); op->SetInput("Y", this->Input("Y")); op->SetInput("Out", this->Input("Out")); + op->SetInput("Out@GRAD", this->Input("Out@GRAD")); op->SetInput("DDX", this->OutputGrad(framework::GradVarName("X"))); op->SetInput("DDY", this->OutputGrad(framework::GradVarName("Y"))); op->SetInput("DX", this->Output(framework::GradVarName("X"))); diff --git a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h index 75e925ff95b0b..bd5c72c02e98b 100644 --- a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h @@ -205,6 +205,18 @@ void DivideDoubleGradKernel(const Context& dev_ctx, DenseTensor* dy, DenseTensor* dout, DenseTensor* ddout) { + if (dy) { + dy->Resize(y.dims()); + dev_ctx.template Alloc(dy); + } + if (dout) { + dout->Resize(out.dims()); + dev_ctx.template Alloc(dout); + } + if (ddout) { + ddout->Resize(out.dims()); + dev_ctx.template Alloc(ddout); + } auto* ddx_tensor = ddx.get_ptr(); auto* ddy_tensor = ddy.get_ptr(); auto* dx_tensor = dx.get_ptr(); @@ -289,7 +301,7 @@ void DivideDoubleGradKernel(const Context& dev_ctx, DivGradDX, DivDoubleDY_Only_DDY>( dev_ctx, - out, + *dx_tensor, *ddy_tensor, // ddy out, // out tmp, // dX / Y @@ -338,65 +350,65 @@ void DivideDoubleGradKernel(const Context& dev_ctx, } else if (!ddx_tensor && ddy_tensor) { // VLOG(4) << "4"; // ddOut = - Out * ddY / Y - // funcs::DefaultElementwiseOperator, - // funcs::InverseMultiplyFunctor>( - // dev_ctx, out, *ddy_tensor, &tmp, axis); - // // VLOG(4) << "5"; - // funcs::DefaultElementwiseOperator, - // funcs::InverseDivideFunctor>( - // dev_ctx, tmp, y, ddout, axis); - // auto& place = *dev_ctx.eigen_device(); - // auto ddout_result = phi::EigenVector::Flatten(*ddout); - // // VLOG(4) << "6"; - // ddout_result.device(place) = static_cast(-1) * ddout_result; - phi::funcs::ElemwiseGradCompute, - DivDoubleDDOut_Only_DDY>( - dev_ctx, - out, - *ddy_tensor, // ddy - out, // out - y, // Y - axis, - nullptr, - dy, - DivGradDX(), - DivDoubleDDOut_Only_DDY()); + funcs::DefaultElementwiseOperator, + funcs::InverseMultiplyFunctor>( + dev_ctx, out, *ddy_tensor, &tmp, axis); + // VLOG(4) << "5"; + funcs::DefaultElementwiseOperator, + funcs::InverseDivideFunctor>( + dev_ctx, tmp, y, ddout, axis); + auto& place = *dev_ctx.eigen_device(); + auto ddout_result = phi::EigenVector::Flatten(*ddout); + // VLOG(4) << "6"; + ddout_result.device(place) = static_cast(-1) * ddout_result; + // phi::funcs::ElemwiseGradCompute, + // DivDoubleDDOut_Only_DDY>( + // dev_ctx, + // out, + // *ddy_tensor, // ddy + // out, // out + // y, // Y + // axis, + // nullptr, + // ddout, + // DivGradDX(), + // DivDoubleDDOut_Only_DDY()); } else { - // funcs::DefaultElementwiseOperator, - // funcs::InverseMultiplyFunctor>( - // dev_ctx, out, *ddy_tensor, &tmp, axis); - // funcs::DefaultElementwiseOperator, - // funcs::InverseSubtractFunctor>( - // dev_ctx, *ddx_tensor, tmp, &tmp, axis); - // funcs::DefaultElementwiseOperator, - // funcs::InverseDivideFunctor>( - // dev_ctx, tmp, y, ddout, axis); - phi::funcs::ElemwiseGradCompute, - DivDoubleDDOut_Only_DDX>( - dev_ctx, - *ddx_tensor, - y, // ddy 
- out, // out - y, // Y - axis, - nullptr, - dy, - DivGradDX(), - DivDoubleDDOut_Only_DDX()); + funcs::DefaultElementwiseOperator, + funcs::InverseMultiplyFunctor>( + dev_ctx, out, *ddy_tensor, &tmp, axis); + funcs::DefaultElementwiseOperator, + funcs::InverseSubtractFunctor>( + dev_ctx, *ddx_tensor, tmp, &tmp, axis); + funcs::DefaultElementwiseOperator, + funcs::InverseDivideFunctor>( + dev_ctx, tmp, y, ddout, axis); + // phi::funcs::ElemwiseGradCompute, + // DivDoubleDDOut>( + // dev_ctx, + // *ddx_tensor, + // *ddy_tensor, // ddy + // out, // out + // y, // Y + // axis, + // nullptr, + // ddout, + // DivGradDX(), + // DivDoubleDDOut()); } } From 96c6cc6e193f09467482643bcae12dd5d8bfc69e Mon Sep 17 00:00:00 2001 From: YibinLiu666 <2632839426@qq.com> Date: Sat, 16 Mar 2024 09:06:36 +0000 Subject: [PATCH 08/16] update --- .../impl/elementwise_grad_kernel_impl.h | 118 ++++-------------- 1 file changed, 27 insertions(+), 91 deletions(-) diff --git a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h index bd5c72c02e98b..15f15ac9a38bc 100644 --- a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h @@ -174,24 +174,24 @@ struct DivDoubleDY_Only_DDY { } }; -template -struct DivDoubleDDOut { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { - return (x - out * y) / dout; - } -}; - -template -struct DivDoubleDDOut_Only_DDX { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return x / dout; } -}; - -template -struct DivDoubleDDOut_Only_DDY { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { - return -out * y / dout; - } -}; +// template +// struct DivDoubleDDOut { +// HOSTDEVICE T operator()(T x, T y, T out, T dout) const { +// return (x - out * y) / dout; +// } +// }; + +// template +// struct DivDoubleDDOut_Only_DDX { +// HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return x / dout; } +// }; + +// template +// struct DivDoubleDDOut_Only_DDY { +// HOSTDEVICE T operator()(T x, T y, T out, T dout) const { +// return -out * y / dout; +// } +// }; template void DivideDoubleGradKernel(const Context& dev_ctx, @@ -205,18 +205,6 @@ void DivideDoubleGradKernel(const Context& dev_ctx, DenseTensor* dy, DenseTensor* dout, DenseTensor* ddout) { - if (dy) { - dy->Resize(y.dims()); - dev_ctx.template Alloc(dy); - } - if (dout) { - dout->Resize(out.dims()); - dev_ctx.template Alloc(dout); - } - if (ddout) { - ddout->Resize(out.dims()); - dev_ctx.template Alloc(ddout); - } auto* ddx_tensor = ddx.get_ptr(); auto* ddy_tensor = ddy.get_ptr(); auto* dx_tensor = dx.get_ptr(); @@ -238,6 +226,8 @@ void DivideDoubleGradKernel(const Context& dev_ctx, // inplace ddx DenseTensor tmp; if (dout) { + dout->Resize(out.dims()); + dev_ctx.template Alloc(dout); tmp = *dout; } else { tmp.Resize(out.dims()); @@ -247,10 +237,10 @@ void DivideDoubleGradKernel(const Context& dev_ctx, dy->Resize(y.dims()); dev_ctx.template Alloc(dy); if (!ddx_tensor && !ddy_tensor) { - FullLikeKernel(dev_ctx, y, Scalar(0.0), y.dtype(), dy); + FullLikeKernel( + dev_ctx, y, Scalar(static_cast(0.0)), y.dtype(), dy); } else { - // DenseTensor tmp_dy = tmp; - // dX / Y + // pre-compute 'dX / Y' into 'tmp' for 'ddout' and/or 'dy' funcs::DefaultElementwiseOperator, @@ -258,15 +248,6 @@ void DivideDoubleGradKernel(const Context& dev_ctx, dev_ctx, *dx_tensor, y, &tmp, axis); if (ddx_tensor && !ddy_tensor) { // dy = -dX * ddX / Y - // funcs::DefaultElementwiseOperator, - // 
funcs::InverseMultiplyFunctor>( - // dev_ctx, *ddx_tensor, tmp, dy, axis); - // auto& place = *dev_ctx.eigen_device(); - // auto dy_result = phi::EigenVector::Flatten(*dy); - // dy_result.device(place) = static_cast(-1) * dy_result; - phi::funcs::ElemwiseGradCompute, @@ -283,19 +264,6 @@ void DivideDoubleGradKernel(const Context& dev_ctx, DivDoubleDY_Only_DDX()); } else if (!ddx_tensor && ddy_tensor) { // dY = Out * dX * ddY / Y - // VLOG(4) << "1"; - // funcs::DefaultElementwiseOperator, - // funcs::InverseMultiplyFunctor>( - // dev_ctx, *ddy_tensor, tmp, &tmp, axis); - // // VLOG(4) << "2"; - // funcs::DefaultElementwiseOperator, - // funcs::InverseMultiplyFunctor>( - // dev_ctx, out, tmp, dy, axis); - // VLOG(4) << "3"; phi::funcs::ElemwiseGradCompute, @@ -339,7 +307,8 @@ void DivideDoubleGradKernel(const Context& dev_ctx, dev_ctx.template Alloc(ddout); // ddOut = ddX / Y - Out * ddY / Y = (ddX - Out * ddY) / Y if (!ddx_tensor && !ddy_tensor) { - FullLikeKernel(dev_ctx, out, Scalar(0.0), out.dtype(), ddout); + FullLikeKernel( + dev_ctx, out, Scalar(static_cast(0.0)), out.dtype(), ddout); } else if (ddx_tensor != nullptr && ddy_tensor == nullptr) { // ddOut = ddX / Y funcs::DefaultElementwiseOperator>( dev_ctx, *ddx_tensor, y, ddout, axis); } else if (!ddx_tensor && ddy_tensor) { - // VLOG(4) << "4"; // ddOut = - Out * ddY / Y funcs::DefaultElementwiseOperator::Flatten(*ddout); - // VLOG(4) << "6"; ddout_result.device(place) = static_cast(-1) * ddout_result; - // phi::funcs::ElemwiseGradCompute, - // DivDoubleDDOut_Only_DDY>( - // dev_ctx, - // out, - // *ddy_tensor, // ddy - // out, // out - // y, // Y - // axis, - // nullptr, - // ddout, - // DivGradDX(), - // DivDoubleDDOut_Only_DDY()); } else { funcs::DefaultElementwiseOperator, funcs::InverseDivideFunctor>( dev_ctx, tmp, y, ddout, axis); - // phi::funcs::ElemwiseGradCompute, - // DivDoubleDDOut>( - // dev_ctx, - // *ddx_tensor, - // *ddy_tensor, // ddy - // out, // out - // y, // Y - // axis, - // nullptr, - // ddout, - // DivGradDX(), - // DivDoubleDDOut()); } } if (dout) { - dout->Resize(out.dims()); - dev_ctx.template Alloc(dout); if (!ddy_tensor) { - FullLikeKernel(dev_ctx, out, Scalar(0.0), out.dtype(), dout); + FullLikeKernel( + dev_ctx, out, Scalar(static_cast(0.0)), out.dtype(), dout); } else { // dOut = - dX * ddY - // VLOG(4) << "7"; funcs::DefaultElementwiseOperator, @@ -428,7 +365,6 @@ void DivideDoubleGradKernel(const Context& dev_ctx, auto& place = *dev_ctx.eigen_device(); auto dout_result = phi::EigenVector::Flatten(*dout); dout_result.device(place) = static_cast(-1) * dout_result; - // VLOG(4) << "8"; } } } From a74a2fa63baa89fcaab3ac7005b74b7eb1ea556e Mon Sep 17 00:00:00 2001 From: YibinLiu666 <2632839426@qq.com> Date: Sat, 16 Mar 2024 09:16:01 +0000 Subject: [PATCH 09/16] update --- paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h index 15f15ac9a38bc..f0e42708073d5 100644 --- a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h @@ -209,8 +209,8 @@ void DivideDoubleGradKernel(const Context& dev_ctx, auto* ddy_tensor = ddy.get_ptr(); auto* dx_tensor = dx.get_ptr(); DenseTensor dz_div_y; - dz_div_y.Resize(out.dims()); if ((dy || dout) && (!dx_tensor || dx_tensor->dims() != out.dims())) { + dz_div_y.Resize(out.dims()); dev_ctx.template Alloc(&dz_div_y); 
funcs::DefaultElementwiseOperator Date: Mon, 18 Mar 2024 12:15:34 +0000 Subject: [PATCH 10/16] update test --- .../operators/elementwise/elementwise_div_op.cc | 2 +- .../elementwise/test_elementwise_op_grad_grad.h | 14 ++++++++++++-- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cc b/paddle/fluid/operators/elementwise/elementwise_div_op.cc index bcd785293c71e..4029be65a00d6 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cc @@ -107,7 +107,7 @@ class ElementwiseDivDoubleGradMaker : public framework::SingleGradOpMaker { op->SetType("elementwise_div_grad_grad"); op->SetInput("Y", this->Input("Y")); op->SetInput("Out", this->Input("Out")); - op->SetInput("Out@GRAD", this->Input("Out@GRAD")); + op->SetInput("Out@GRAD", this->Input(framework::GradVarName("Out"))); op->SetInput("DDX", this->OutputGrad(framework::GradVarName("X"))); op->SetInput("DDY", this->OutputGrad(framework::GradVarName("Y"))); op->SetInput("DX", this->Output(framework::GradVarName("X"))); diff --git a/test/cpp/fluid/elementwise/test_elementwise_op_grad_grad.h b/test/cpp/fluid/elementwise/test_elementwise_op_grad_grad.h index ab67c559532d9..3e772aa632e52 100644 --- a/test/cpp/fluid/elementwise/test_elementwise_op_grad_grad.h +++ b/test/cpp/fluid/elementwise/test_elementwise_op_grad_grad.h @@ -135,8 +135,18 @@ class TestElementwiseOpGradGrad { expected_outs_[out_name].data(), [](const float &l, const float &r) { return fabs(l - r) < 1e-8; }); #else - auto is_equal = - std::equal(out_ptr, out_ptr + numel, expected_outs_[out_name].data()); + bool is_equal; + if (op_type_ == "elementwise_div_grad_grad") { + is_equal = std::equal(out_ptr, + out_ptr + numel, + expected_outs_[out_name].data(), + [](const float &l, const float &r) { + return fabs(l - r) < 0.0005; + }); + } else { + is_equal = std::equal( + out_ptr, out_ptr + numel, expected_outs_[out_name].data()); + } #endif if (!is_equal) { all_equal = false; From 87350dc85c3c8a19403492df4eb6150e261b88c0 Mon Sep 17 00:00:00 2001 From: YibinLiu666 <2632839426@qq.com> Date: Sun, 24 Mar 2024 12:16:02 +0000 Subject: [PATCH 11/16] update ddout --- .../impl/elementwise_grad_kernel_impl.h | 309 +++++++++++++++--- .../test_elementwise_div_grad_grad.cc | 92 +++++- 2 files changed, 344 insertions(+), 57 deletions(-) diff --git a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h index f0e42708073d5..2ead7cd004428 100644 --- a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h @@ -25,6 +25,7 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" +#include "paddle/phi/kernels/funcs/elementwise_utils.h" namespace phi { @@ -174,24 +175,249 @@ struct DivDoubleDY_Only_DDY { } }; -// template -// struct DivDoubleDDOut { -// HOSTDEVICE T operator()(T x, T y, T out, T dout) const { -// return (x - out * y) / dout; -// } -// }; - -// template -// struct DivDoubleDDOut_Only_DDX { -// HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return x / dout; } -// }; - -// template -// struct DivDoubleDDOut_Only_DDY { -// HOSTDEVICE T operator()(T x, T y, T out, T dout) const { -// return -out * y / dout; -// } -// }; +// ddOut = ddX / Y - Out * ddY / Y = (ddX - Out * ddY) / Y +template +struct DivDoubleDDOut { + HOSTDEVICE T operator()(const T& ddx, + const T& ddy, + const T& y, + const T& out) const { + return (ddx - out * ddy) / y; + } +}; + +template +struct DivDoubleDDOut_Only_DDY { + HOSTDEVICE T operator()(const T& ddx, + const T& ddy, + const T& y, + const T& out) const { + return -out * ddy / y; + } +}; + +template +void ComputeDDoutWithoutBroadcast(const CPUContext& dev_ctx UNUSED, + const phi::DenseTensor& ddx, + const phi::DenseTensor& ddy, + const phi::DenseTensor& y, + const phi::DenseTensor& out, + phi::DenseTensor* ddout, + DDout_OP dout_op) { + auto out_numel = out.numel(); + auto* ddx_data = ddx.data(); + auto* ddy_data = ddy.data(); + auto* y_data = y.data(); + auto* out_data = out.data(); + auto* ddout_data = ddout->data(); + for (int i = 0; i < out_numel; i++) { + ddout_data[i] = dout_op(ddx_data[i], ddy_data[i], y_data[i], out_data[i]); + } +} + +template +void ComputeDDoutWithBroadcast(const CPUContext& dev_ctx UNUSED, + const phi::DenseTensor& ddx, + const phi::DenseTensor& ddy, + const phi::DenseTensor& y, + const phi::DenseTensor& out, + phi::DenseTensor* ddout, + const int* x_dims_array, + const int* y_dims_array, + const int* out_dims_array, + const int max_dim, + DDout_OP dout_op) { + auto out_numel = out.numel(); + auto* ddx_data = ddx.data(); + auto* ddy_data = ddy.data(); + auto* y_data = y.data(); + auto* out_data = out.data(); + auto* ddout_data = ddout->data(); + std::vector index_array(max_dim, 0); + for (int i = 0; i < out_numel; i++) { + int x_index = phi::funcs::GetElementwiseIndex( + x_dims_array, max_dim, index_array.data()); + int y_index = phi::funcs::GetElementwiseIndex( + y_dims_array, max_dim, index_array.data()); + ddout_data[i] = dout_op( + ddx_data[x_index], ddy_data[y_index], y_data[y_index], out_data[i]); + phi::funcs::UpdateElementwiseIndexArray( + out_dims_array, max_dim, index_array.data()); + } +} + +#if defined(__NVCC__) + +template +__global__ void ComputeDDoutWithoutBroadcastGPUKernel(const T* ddx_data, + const T* ddy_data, + const T* y_data, + const T* out_data, + T* ddout_data, + int numel, + DDout_OP dout_op) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid >= numel) return; + ddout_data[tid] = + dout_op(ddx_data[tid], ddy_data[tid], y_data[tid], out_data[tid]); +} +template +void ComputeDDoutWithoutBroadcast(const GPUContext& dev_ctx UNUSED, + const phi::DenseTensor& ddx, + const phi::DenseTensor& ddy, + const phi::DenseTensor& y, + const phi::DenseTensor& out, + phi::DenseTensor* ddout, + DDout_OP dout_op) { + auto out_numel = out.numel(); + auto* ddx_data = ddx.data(); + auto* ddy_data = ddy.data(); + auto* y_data = y.data(); + auto* out_data = out.data(); + auto* ddout_data = ddout->data(); + int 
block = 512; + int64_t grid = (out_numel + block - 1) / block; + auto stream = reinterpret_cast(dev_ctx).stream(); + ComputeDDoutWithoutBroadcastGPUKernel + <<>>( + ddx_data, ddy_data, y_data, out_data, ddout_data, out_numel, dout_op); +} + +template +__global__ void ComputeDDoutWithBroadcastGPUKernel(const T* ddx_data, + const T* ddy_data, + const T* y_data, + const T* out_data, + T* ddout_data, + int numel, + const int* x_dims_array, + const int* y_dims_array, + const int* out_dims_array, + const int max_dim, + DDout_OP dout_op) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid >= numel) return; + int x_index = 0, y_index = 0, x_index_prod = 1, y_index_prod = 1, + out_index = tid, dim_index; + for (int64_t i = max_dim - 1; i >= 0; i--) { + if (out_index == 0) break; + dim_index = out_index % out_dims_array[i]; + out_index = out_index / out_dims_array[i]; + if (x_dims_array[i] > 1) { + x_index += dim_index * x_index_prod; + x_index_prod *= x_dims_array[i]; + } + if (y_dims_array[i] > 1) { + y_index += dim_index * y_index_prod; + y_index_prod *= y_dims_array[i]; + } + } + ddout_data[tid] = dout_op( + ddx_data[x_index], ddy_data[y_index], y_data[y_index], out_data[tid]); +} + +template +void ComputeDDoutWithBroadcast(const GPUContext& dev_ctx UNUSED, + const phi::DenseTensor& ddx, + const phi::DenseTensor& ddy, + const phi::DenseTensor& y, + const phi::DenseTensor& out, + phi::DenseTensor* ddout, + const int* x_dims_array, + const int* y_dims_array, + const int* out_dims_array, + const int max_dim, + DDout_OP dout_op) { + auto out_numel = out.numel(); + auto* ddx_data = ddx.data(); + auto* ddy_data = ddy.data(); + auto* y_data = y.data(); + auto* out_data = out.data(); + auto* ddout_data = ddout->data(); + DenseTensor x_dims_array_gpu; + x_dims_array_gpu.Resize({max_dim}); + int* x_dims_array_gpu_data = dev_ctx.template Alloc(&x_dims_array_gpu); + cudaMemcpy(x_dims_array_gpu_data, + x_dims_array, + sizeof(int) * max_dim, + cudaMemcpyHostToDevice); + DenseTensor y_dims_array_gpu; + y_dims_array_gpu.Resize({max_dim}); + int* y_dims_array_gpu_data = dev_ctx.template Alloc(&y_dims_array_gpu); + cudaMemcpy(y_dims_array_gpu_data, + y_dims_array, + sizeof(int) * max_dim, + cudaMemcpyHostToDevice); + DenseTensor out_dims_array_gpu; + out_dims_array_gpu.Resize({max_dim}); + int* out_dims_array_gpu_data = + dev_ctx.template Alloc(&out_dims_array_gpu); + cudaMemcpy(out_dims_array_gpu_data, + out_dims_array, + sizeof(int) * max_dim, + cudaMemcpyHostToDevice); + int block = 512; + int64_t grid = (out_numel + block - 1) / block; + auto stream = reinterpret_cast(dev_ctx).stream(); + ComputeDDoutWithBroadcastGPUKernel + <<>>(ddx_data, + ddy_data, + y_data, + out_data, + ddout_data, + out_numel, + x_dims_array_gpu_data, + y_dims_array_gpu_data, + out_dims_array_gpu_data, + max_dim, + dout_op); +} + +#endif + +template +void DivDoubleDDoutCompute(const DeviceContext& dev_ctx, + const phi::DenseTensor& ddx, + const phi::DenseTensor& ddy, + const phi::DenseTensor& y, + const phi::DenseTensor& out, + int axis, + phi::DenseTensor* ddout, + DDout_OP dout_op) { + auto x_dims = ddx.dims(); + auto y_dims = ddy.dims(); + if (x_dims == y_dims) { + ComputeDDoutWithoutBroadcast( + dev_ctx, ddx, ddy, y, out, ddout, dout_op); + } else { + int max_dim = std::max(x_dims.size(), y_dims.size()); + axis = (axis == -1 ? 
std::abs(x_dims.size() - y_dims.size()) : axis); + std::vector x_dims_array(max_dim, 0); + std::vector y_dims_array(max_dim, 0); + std::vector out_dims_array(max_dim, 0); + phi::funcs::GetBroadcastDimsArrays(x_dims, + y_dims, + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + axis); + ComputeDDoutWithBroadcast(dev_ctx, + ddx, + ddy, + y, + out, + ddout, + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + dout_op); + } +} template void DivideDoubleGradKernel(const Context& dev_ctx, @@ -318,36 +544,25 @@ void DivideDoubleGradKernel(const Context& dev_ctx, dev_ctx, *ddx_tensor, y, ddout, axis); } else if (!ddx_tensor && ddy_tensor) { // ddOut = - Out * ddY / Y - funcs::DefaultElementwiseOperator, - funcs::InverseMultiplyFunctor>( - dev_ctx, out, *ddy_tensor, &tmp, axis); - // VLOG(4) << "5"; - funcs::DefaultElementwiseOperator, - funcs::InverseDivideFunctor>( - dev_ctx, tmp, y, ddout, axis); - auto& place = *dev_ctx.eigen_device(); - auto ddout_result = phi::EigenVector::Flatten(*ddout); - ddout_result.device(place) = static_cast(-1) * ddout_result; + DivDoubleDDoutCompute, T>( + dev_ctx, + *ddx_tensor, + *ddy_tensor, + y, + out, + axis, + ddout, + DivDoubleDDOut_Only_DDY()); } else { - funcs::DefaultElementwiseOperator, - funcs::InverseMultiplyFunctor>( - dev_ctx, out, *ddy_tensor, &tmp, axis); - funcs::DefaultElementwiseOperator, - funcs::InverseSubtractFunctor>( - dev_ctx, *ddx_tensor, tmp, &tmp, axis); - funcs::DefaultElementwiseOperator, - funcs::InverseDivideFunctor>( - dev_ctx, tmp, y, ddout, axis); + DivDoubleDDoutCompute, T>( + dev_ctx, + *ddx_tensor, + *ddy_tensor, + y, + out, + axis, + ddout, + DivDoubleDDOut()); } } diff --git a/test/cpp/fluid/elementwise/test_elementwise_div_grad_grad.cc b/test/cpp/fluid/elementwise/test_elementwise_div_grad_grad.cc index 8132b3b4bd295..257fa0289ef2a 100644 --- a/test/cpp/fluid/elementwise/test_elementwise_div_grad_grad.cc +++ b/test/cpp/fluid/elementwise/test_elementwise_div_grad_grad.cc @@ -41,17 +41,16 @@ namespace paddle { namespace operators { template -class TestElementwiseDivGradGradWithoutDout - : public TestElementwiseOpGradGrad { +class TestElementwiseDivGradGradWithDout : public TestElementwiseOpGradGrad { public: - TestElementwiseDivGradGradWithoutDout(const platform::Place &place, - const framework::DDim &dims) + TestElementwiseDivGradGradWithDout(const platform::Place &place, + const framework::DDim &dims) : TestElementwiseOpGradGrad( "elementwise_div_grad_grad", place, dims, {"Y", "Out", "Out@GRAD", "DDX", "DDY", "DX"}, - {"Y@GRAD", "DDOut"}) {} + {"Y@GRAD", "DDOut", "DOut"}) {} using TestElementwiseOpGradGrad::feed_datas_; using TestElementwiseOpGradGrad::expected_outs_; @@ -60,6 +59,7 @@ class TestElementwiseDivGradGradWithoutDout size_t numel = static_cast(common::product(dims_)); std::vector dy(numel); std::vector ddout(numel); + std::vector dout(numel); for (size_t i = 0; i < numel; ++i) { // dY(Y@GRAD) = Out * dX * ddY / Y - dX * ddX / Y dy[i] = (feed_datas_["DX"][i] / feed_datas_["Y"][i]) * @@ -69,9 +69,12 @@ class TestElementwiseDivGradGradWithoutDout ddout[i] = (feed_datas_["DDX"][i] - feed_datas_["Out"][i] * feed_datas_["DDY"][i]) / (feed_datas_["Y"][i]); + // dOut = - DX * DDy + dout[i] = -feed_datas_["DX"][i] * feed_datas_["DDY"][i]; } expected_outs_["Y@GRAD"] = dy; expected_outs_["DDOut"] = ddout; + expected_outs_["DOut"] = dout; } std::unique_ptr CreateTestOp() override { @@ -83,27 +86,96 @@ class TestElementwiseDivGradGradWithoutDout {"DDX", {"DDX"}}, 
{"DDY", {"DDY"}}, {"DX", {"DX"}}}, - {{"Y@GRAD", {"Y@GRAD"}}, {"DDOut", {"DDOut"}}}, + {{"Y@GRAD", {"Y@GRAD"}}, {"DDOut", {"DDOut"}}, {"DOut", {"DOut"}}}, {{"use_mkldnn", false}, {"axis", 0}}); return op; } }; -TEST(test_elementwise_div_grad_grad_without_dout, cpu_place) { +TEST(test_elementwise_div_grad_grad, cpu_place) { framework::DDim dims({32, 64}); platform::CPUPlace p; - TestElementwiseDivGradGradWithoutDout test(p, dims); + TestElementwiseDivGradGradWithDout test(p, dims); ASSERT_TRUE(test.Check()); } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -TEST(test_elementwise_div_grad_grad_without_dout, gpu_place) { +TEST(test_elementwise_div_grad_grad, gpu_place) { framework::DDim dims({32, 64}); platform::CUDAPlace p(0); - TestElementwiseDivGradGradWithoutDout test(p, dims); + TestElementwiseDivGradGradWithDout test(p, dims); ASSERT_TRUE(test.Check()); } #endif +// template +// class TestElementwiseDivGradGradWithoutDX +// : public TestElementwiseOpGradGrad { +// public: +// TestElementwiseDivGradGradWithoutDX(const platform::Place &place, +// const framework::DDim &dims) +// : TestElementwiseOpGradGrad( +// "elementwise_div_grad_grad", +// place, +// dims, +// {"Y", "Out", "Out@GRAD", "DDX", "DDY"}, +// {"Y@GRAD", "DDOut", "DOut"}) {} + +// using TestElementwiseOpGradGrad::feed_datas_; +// using TestElementwiseOpGradGrad::expected_outs_; +// using TestElementwiseOpGradGrad::dims_; +// void ComputeExpectedOuts() override { +// size_t numel = static_cast(common::product(dims_)); +// std::vector dy(numel); +// std::vector ddout(numel); +// std::vector dout(numel); +// for (size_t i = 0; i < numel; ++i) { +// // dY(Y@GRAD) = Out * dX * ddY / Y - dX * ddX / Y +// auto dx = feed_datas_["Out@GRAD"][i] / feed_datas_["Y"][i]; +// dy[i] = (dx / feed_datas_["Y"][i]) * +// (feed_datas_["Out"][i] * feed_datas_["DDY"][i] - +// feed_datas_["DDX"][i]); +// // ddOut = ddX / Y - Out * ddY / Y = (ddX - Out * ddY) / Y +// ddout[i] = (feed_datas_["DDX"][i] - +// feed_datas_["Out"][i] * feed_datas_["DDY"][i]) / +// (feed_datas_["Y"][i]); +// // dOut = - DX * DDy +// dout[i] = -dx * feed_datas_["DDY"][i]; +// } +// expected_outs_["Y@GRAD"] = dy; +// expected_outs_["DDOut"] = ddout; +// expected_outs_["DOut"] = dout; +// } + +// std::unique_ptr CreateTestOp() override { +// auto op = framework::OpRegistry::CreateOp( +// this->op_type_, +// {{"Y", {"Y"}}, +// {"Out", {"Out"}}, +// {"Out@GRAD", {"Out@GRAD"}}, +// {"DDX", {"DDX"}}, +// {"DDY", {"DDY"}}}, +// {{"Y@GRAD", {"Y@GRAD"}}, {"DDOut", {"DDOut"}}, {"DOut", {"DOut"}}}, +// {{"use_mkldnn", false}, {"axis", 0}}); +// return op; +// } +// }; + +// TEST(test_elementwise_div_grad_grad_without_dx, cpu_place) { +// framework::DDim dims({32, 64}); +// platform::CPUPlace p; +// TestElementwiseDivGradGradWithoutDX test(p, dims); +// ASSERT_TRUE(test.Check()); +// } + +// #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +// TEST(test_elementwise_div_grad_grad_without_dx, gpu_place) { +// framework::DDim dims({32, 64}); +// platform::CUDAPlace p(0); +// TestElementwiseDivGradGradWithoutDX test(p, dims); +// ASSERT_TRUE(test.Check()); +// } +// #endif + } // namespace operators } // namespace paddle From 0f8b0f446f9d3a3509a6e4835972a39c96a88b00 Mon Sep 17 00:00:00 2001 From: YibinLiu666 <2632839426@qq.com> Date: Wed, 27 Mar 2024 06:24:04 +0000 Subject: [PATCH 12/16] update device --- paddle/phi/kernels/funcs/common_shape.h | 6 +- .../impl/elementwise_grad_kernel_impl.h | 61 +++++++++++++++- .../test_elementwise_div_grad_grad.cc | 69 
------------------- 3 files changed, 62 insertions(+), 74 deletions(-) diff --git a/paddle/phi/kernels/funcs/common_shape.h b/paddle/phi/kernels/funcs/common_shape.h index 19f2fa1f2fac4..f148589262884 100644 --- a/paddle/phi/kernels/funcs/common_shape.h +++ b/paddle/phi/kernels/funcs/common_shape.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include "glog/logging.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" @@ -52,7 +53,7 @@ inline void GetBroadcastDimsArrays(const DDim &x_dims, "Axis should be less than or equal to %d, but received axis is %d.", max_dim, axis)); - + VLOG(4) << "start get shape\n"; if (x_dims.size() > y_dims.size()) { std::fill(y_dims_array, y_dims_array + axis, 1); if (axis + y_dims.size() < max_dim) { @@ -68,7 +69,7 @@ inline void GetBroadcastDimsArrays(const DDim &x_dims, std::copy(x_dims.Get(), x_dims.Get() + x_dims.size(), x_dims_array + axis); std::copy(y_dims.Get(), y_dims.Get() + y_dims.size(), y_dims_array); } - + VLOG(4) << "start for\n"; for (int i = 0; i < max_dim; ++i) { PADDLE_ENFORCE_EQ( x_dims_array[i] == y_dims_array[i] || x_dims_array[i] <= 1 || @@ -91,6 +92,7 @@ inline void GetBroadcastDimsArrays(const DDim &x_dims, out_dims_array[i] = -1; } } + VLOG(4) << "end\n"; } inline void GetPrePostNumel( diff --git a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h index 2ead7cd004428..85e54625e60bd 100644 --- a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h @@ -246,7 +246,7 @@ void ComputeDDoutWithBroadcast(const CPUContext& dev_ctx UNUSED, } } -#if defined(__NVCC__) +#if defined(__NVCC__) || defined(__HIPCC__) template __global__ void ComputeDDoutWithoutBroadcastGPUKernel(const T* ddx_data, @@ -337,25 +337,46 @@ void ComputeDDoutWithBroadcast(const GPUContext& dev_ctx UNUSED, DenseTensor x_dims_array_gpu; x_dims_array_gpu.Resize({max_dim}); int* x_dims_array_gpu_data = dev_ctx.template Alloc(&x_dims_array_gpu); +#if defined(__NVCC__) cudaMemcpy(x_dims_array_gpu_data, x_dims_array, sizeof(int) * max_dim, cudaMemcpyHostToDevice); +#else + hipMemcpy(x_dims_array_gpu_data, + x_dims_array, + sizeof(int) * max_dim, + hipMemcpyHostToDevice); +#endif DenseTensor y_dims_array_gpu; y_dims_array_gpu.Resize({max_dim}); int* y_dims_array_gpu_data = dev_ctx.template Alloc(&y_dims_array_gpu); +#if defined(__NVCC__) cudaMemcpy(y_dims_array_gpu_data, y_dims_array, sizeof(int) * max_dim, cudaMemcpyHostToDevice); +#else + hipMemcpy(y_dims_array_gpu_data, + y_dims_array, + sizeof(int) * max_dim, + hipMemcpyHostToDevice); +#endif DenseTensor out_dims_array_gpu; out_dims_array_gpu.Resize({max_dim}); int* out_dims_array_gpu_data = dev_ctx.template Alloc(&out_dims_array_gpu); +#if defined(__NVCC__) cudaMemcpy(out_dims_array_gpu_data, out_dims_array, sizeof(int) * max_dim, cudaMemcpyHostToDevice); +#else + hipMemcpy(out_dims_array_gpu_data, + out_dims_array, + sizeof(int) * max_dim, + hipMemcpyHostToDevice); +#endif int block = 512; int64_t grid = (out_numel + block - 1) / block; auto stream = reinterpret_cast(dev_ctx).stream(); @@ -543,17 +564,50 @@ void DivideDoubleGradKernel(const Context& dev_ctx, funcs::InverseDivideFunctor>( dev_ctx, *ddx_tensor, y, ddout, axis); } else if (!ddx_tensor && ddy_tensor) { - // ddOut = - Out * ddY / Y +// ddOut = - Out * ddY / Y +#if defined(__xpu__) + funcs::DefaultElementwiseOperator, + 
funcs::InverseMultiplyFunctor>( + dev_ctx, out, *ddy_tensor, &tmp, axis); + funcs::DefaultElementwiseOperator, + funcs::InverseDivideFunctor>( + dev_ctx, tmp, y, ddout, axis); + auto& place = *dev_ctx.eigen_device(); + auto ddout_result = phi::EigenVector::Flatten(*ddout); + ddout_result.device(place) = static_cast(-1) * ddout_result; +#else DivDoubleDDoutCompute, T>( dev_ctx, - *ddx_tensor, + *dx_tensor, *ddy_tensor, y, out, axis, ddout, DivDoubleDDOut_Only_DDY()); +#endif } else { +#if defined(__xpu__) + funcs::DefaultElementwiseOperator, + funcs::InverseMultiplyFunctor>( + dev_ctx, out, *ddy_tensor, &tmp, axis); + funcs::DefaultElementwiseOperator, + funcs::InverseSubtractFunctor>( + dev_ctx, *ddx_tensor, tmp, &tmp, axis); + funcs::DefaultElementwiseOperator, + funcs::InverseDivideFunctor>( + dev_ctx, tmp, y, ddout, axis); +#else DivDoubleDDoutCompute, T>( dev_ctx, *ddx_tensor, @@ -563,6 +617,7 @@ void DivideDoubleGradKernel(const Context& dev_ctx, axis, ddout, DivDoubleDDOut()); +#endif } } diff --git a/test/cpp/fluid/elementwise/test_elementwise_div_grad_grad.cc b/test/cpp/fluid/elementwise/test_elementwise_div_grad_grad.cc index 257fa0289ef2a..a29cc2ea43f7c 100644 --- a/test/cpp/fluid/elementwise/test_elementwise_div_grad_grad.cc +++ b/test/cpp/fluid/elementwise/test_elementwise_div_grad_grad.cc @@ -108,74 +108,5 @@ TEST(test_elementwise_div_grad_grad, gpu_place) { } #endif -// template -// class TestElementwiseDivGradGradWithoutDX -// : public TestElementwiseOpGradGrad { -// public: -// TestElementwiseDivGradGradWithoutDX(const platform::Place &place, -// const framework::DDim &dims) -// : TestElementwiseOpGradGrad( -// "elementwise_div_grad_grad", -// place, -// dims, -// {"Y", "Out", "Out@GRAD", "DDX", "DDY"}, -// {"Y@GRAD", "DDOut", "DOut"}) {} - -// using TestElementwiseOpGradGrad::feed_datas_; -// using TestElementwiseOpGradGrad::expected_outs_; -// using TestElementwiseOpGradGrad::dims_; -// void ComputeExpectedOuts() override { -// size_t numel = static_cast(common::product(dims_)); -// std::vector dy(numel); -// std::vector ddout(numel); -// std::vector dout(numel); -// for (size_t i = 0; i < numel; ++i) { -// // dY(Y@GRAD) = Out * dX * ddY / Y - dX * ddX / Y -// auto dx = feed_datas_["Out@GRAD"][i] / feed_datas_["Y"][i]; -// dy[i] = (dx / feed_datas_["Y"][i]) * -// (feed_datas_["Out"][i] * feed_datas_["DDY"][i] - -// feed_datas_["DDX"][i]); -// // ddOut = ddX / Y - Out * ddY / Y = (ddX - Out * ddY) / Y -// ddout[i] = (feed_datas_["DDX"][i] - -// feed_datas_["Out"][i] * feed_datas_["DDY"][i]) / -// (feed_datas_["Y"][i]); -// // dOut = - DX * DDy -// dout[i] = -dx * feed_datas_["DDY"][i]; -// } -// expected_outs_["Y@GRAD"] = dy; -// expected_outs_["DDOut"] = ddout; -// expected_outs_["DOut"] = dout; -// } - -// std::unique_ptr CreateTestOp() override { -// auto op = framework::OpRegistry::CreateOp( -// this->op_type_, -// {{"Y", {"Y"}}, -// {"Out", {"Out"}}, -// {"Out@GRAD", {"Out@GRAD"}}, -// {"DDX", {"DDX"}}, -// {"DDY", {"DDY"}}}, -// {{"Y@GRAD", {"Y@GRAD"}}, {"DDOut", {"DDOut"}}, {"DOut", {"DOut"}}}, -// {{"use_mkldnn", false}, {"axis", 0}}); -// return op; -// } -// }; - -// TEST(test_elementwise_div_grad_grad_without_dx, cpu_place) { -// framework::DDim dims({32, 64}); -// platform::CPUPlace p; -// TestElementwiseDivGradGradWithoutDX test(p, dims); -// ASSERT_TRUE(test.Check()); -// } - -// #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -// TEST(test_elementwise_div_grad_grad_without_dx, gpu_place) { -// framework::DDim dims({32, 64}); -// platform::CUDAPlace 
p(0);
-// TestElementwiseDivGradGradWithoutDX test(p, dims);
-// ASSERT_TRUE(test.Check());
-// }
-// #endif
 
 }  // namespace operators
 }  // namespace paddle

From 99e52810e277845e93bbd54ff329c20c165466f8 Mon Sep 17 00:00:00 2001
From: YibinLiu666 <2632839426@qq.com>
Date: Wed, 27 Mar 2024 06:27:35 +0000
Subject: [PATCH 13/16] add constant

---
 .../phi/kernels/impl/elementwise_grad_kernel_impl.h | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h
index 85e54625e60bd..899df8710c0ba 100644
--- a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h
@@ -165,12 +165,20 @@ struct DivDoubleDY {
 
 template <typename T>
 struct DivDoubleDY_Only_DDX {
-  HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return -x * dout; }
+  HOSTDEVICE T operator()(const T& x,
+                          const T& y,
+                          const T& out,
+                          const T& dout) const {
+    return -x * dout;
+  }
 };
 
 template <typename T>
 struct DivDoubleDY_Only_DDY {
-  HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
+  HOSTDEVICE T operator()(const T& x,
+                          const T& y,
+                          const T& out,
+                          const T& dout) const {
     return y * out * dout;
   }
 };

From 8016cf502fc8086086717316f0f5347698c3cf42 Mon Sep 17 00:00:00 2001
From: YibinLiu666 <2632839426@qq.com>
Date: Wed, 27 Mar 2024 06:28:42 +0000
Subject: [PATCH 14/16] update

---
 paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h
index 899df8710c0ba..0ece81a3d2384 100644
--- a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h
@@ -158,18 +158,11 @@ struct DivGradDY<phi::dtype::complex<T>> {
 
 template <typename T>
 struct DivDoubleDY {
-  HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
-    return (y * out - x) * dout;
-  }
-};
-
-template <typename T>
-struct DivDoubleDY_Only_DDX {
   HOSTDEVICE T operator()(const T& x,
                           const T& y,
                           const T& out,
                           const T& dout) const {
-    return -x * dout;
+    return (y * out - x) * dout;
   }
 };

From a718d8e2efda1fd68b327f476738a0c3a3721b9d Mon Sep 17 00:00:00 2001
From: YibinLiu666 <2632839426@qq.com>
Date: Wed, 27 Mar 2024 12:01:46 +0000
Subject: [PATCH 15/16] fix bug

---
 paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h
index 0ece81a3d2384..4bd0ede6dc827 100644
--- a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h
@@ -176,6 +176,16 @@ struct DivDoubleDY_Only_DDY {
   }
 };
 
+template <typename T>
+struct DivDoubleDY_Only_DDX {
+  HOSTDEVICE T operator()(const T& x,
+                          const T& y,
+                          const T& out,
+                          const T& dout) const {
+    return -x * dout;
+  }
+};
+
 // ddOut = ddX / Y - Out * ddY / Y = (ddX - Out * ddY) / Y
 template <typename T>
 struct DivDoubleDDOut {

From 156198a72aadbf474301f175e400db5ef81ae6ec Mon Sep 17 00:00:00 2001
From: YibinLiu666 <2632839426@qq.com>
Date: Sat, 30 Mar 2024 15:31:09 +0000
Subject: [PATCH 16/16] remove vlog

---
 paddle/phi/kernels/funcs/common_shape.h | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/paddle/phi/kernels/funcs/common_shape.h b/paddle/phi/kernels/funcs/common_shape.h
index f148589262884..45a1024339ba3 100644
--- a/paddle/phi/kernels/funcs/common_shape.h
+++ b/paddle/phi/kernels/funcs/common_shape.h
@@ -14,7 +14,6 @@ limitations under the License. */
 
 #pragma once
 
-#include "glog/logging.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
 #include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
@@ -53,7 +52,6 @@ inline void GetBroadcastDimsArrays(const DDim &x_dims,
         "Axis should be less than or equal to %d, but received axis is %d.",
         max_dim,
         axis));
-  VLOG(4) << "start get shape\n";
   if (x_dims.size() > y_dims.size()) {
     std::fill(y_dims_array, y_dims_array + axis, 1);
     if (axis + y_dims.size() < max_dim) {
@@ -69,7 +67,6 @@ inline void GetBroadcastDimsArrays(const DDim &x_dims,
     std::copy(x_dims.Get(), x_dims.Get() + x_dims.size(), x_dims_array + axis);
     std::copy(y_dims.Get(), y_dims.Get() + y_dims.size(), y_dims_array);
   }
-  VLOG(4) << "start for\n";
   for (int i = 0; i < max_dim; ++i) {
     PADDLE_ENFORCE_EQ(
         x_dims_array[i] == y_dims_array[i] || x_dims_array[i] <= 1 ||
@@ -92,7 +89,6 @@ inline void GetBroadcastDimsArrays(const DDim &x_dims,
       out_dims_array[i] = -1;
     }
   }
-  VLOG(4) << "end\n";
 }
 
 inline void GetPrePostNumel(
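For reference, with out = x / y the second-order formulas implemented across
this series are: dX = dOut / Y, dY = (Out * ddY - ddX) * dX / Y,
dOut = -dX * ddY, and ddOut = (ddX - Out * ddY) / Y. Below is a minimal
standalone C++ sketch of the same per-element math — not Paddle code — that
assumes equal shapes (no broadcasting) and zero-filled ddX/ddY when an input
is absent; the function name and vector-based interface are invented for
illustration only.

// Standalone reference for the second-order gradients of out = x / y,
// mirroring the fused functors (DivDoubleDY, DivDoubleDDOut) in this series.
#include <cstddef>
#include <vector>

template <typename T>
void DivideDoubleGradReference(const std::vector<T>& y,
                               const std::vector<T>& out,   // out = x / y
                               const std::vector<T>& dout,  // upstream grad
                               const std::vector<T>& ddx,
                               const std::vector<T>& ddy,
                               std::vector<T>& dy,
                               std::vector<T>& new_dout,
                               std::vector<T>& ddout) {
  const std::size_t n = y.size();
  dy.resize(n);
  new_dout.resize(n);
  ddout.resize(n);
  for (std::size_t i = 0; i < n; ++i) {
    // dX = dOut / Y (recomputed when dX is unavailable, as in PATCH 03)
    const T dx = dout[i] / y[i];
    // dY = (Out * ddY - ddX) * dX / Y   (fused form of DivDoubleDY)
    dy[i] = (out[i] * ddy[i] - ddx[i]) * dx / y[i];
    // dOut = -dX * ddY
    new_dout[i] = -dx * ddy[i];
    // ddOut = (ddX - Out * ddY) / Y     (DivDoubleDDOut)
    ddout[i] = (ddx[i] - out[i] * ddy[i]) / y[i];
  }
}

The speedup in the series comes from the same restructuring sketched above:
instead of materializing ddX_safe/ddY_safe zero tensors and chaining several
DefaultElementwiseOperator passes through temporaries, each optional-input
combination gets its own branch, and the dY/ddOut math is fused into a single
functor pass per output.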