From d99c0ef748182a0c28646a78292143ed332567c2 Mon Sep 17 00:00:00 2001
From: co63oc <co63@163.com>
Date: Mon, 6 May 2024 16:58:20 +0800
Subject: [PATCH 1/5] Add phi CPU fused_elementwise kernels and ops.yaml entries

---
 paddle/phi/api/yaml/ops.yaml                  |  48 ++++++
 .../legacy/cpu/fused_elementwise_kernel.cc    | 153 ++++++++++++++++++
 2 files changed, 201 insertions(+)
 create mode 100644 paddle/phi/kernels/legacy/cpu/fused_elementwise_kernel.cc

diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml
index 98da34dd2d442..08713e4bfa432 100755
--- a/paddle/phi/api/yaml/ops.yaml
+++ b/paddle/phi/api/yaml/ops.yaml
@@ -1227,6 +1227,54 @@
     backend : place
   interfaces : paddle::dialect::InferSymbolicShapeInterface
 
+- op : fused_elementwise_add
+  args: (Tensor x, Tensor y, int axis = -1, str fuse_activation = "", float fuse_alpha
+    = 0.0f, float fuse_beta = 0.0f, float fused_output_scale = 1.0f, int[] fused_unsqueeze2_axes
+    = {}, float scale_x = 1.0f, float scale_y = 1.0f, float scale_out = 1.0f)
+  output: Tensor (out)
+  infer_meta:
+    func: ElementwiseInferMeta
+    param : [x, y]
+  kernel :
+    func : fused_elementwise_add
+    data_type : x
+
+- op : fused_elementwise_div
+  args: (Tensor x, Tensor y, int axis = -1, str fuse_activation = "", float fuse_alpha
+    = 0.0f, float fuse_beta = 0.0f, float fused_output_scale = 1.0f, int[] fused_unsqueeze2_axes
+    = {}, float scale_x = 1.0f, float scale_y = 1.0f, float scale_out = 1.0f)
+  output: Tensor (out)
+  infer_meta:
+    func: ElementwiseInferMeta
+    param : [x, y]
+  kernel :
+    func : fused_elementwise_div
+    data_type : x
+
+- op : fused_elementwise_mul
+  args: (Tensor x, Tensor y, int axis = -1, str fuse_activation = "", float fuse_alpha
+    = 0.0f, float fuse_beta = 0.0f, float fused_output_scale = 1.0f, int[] fused_unsqueeze2_axes
+    = {}, float scale_x = 1.0f, float scale_y = 1.0f, float scale_out = 1.0f)
+  output: Tensor (out)
+  infer_meta:
+    func: ElementwiseInferMeta
+    param : [x, y]
+  kernel :
+    func : fused_elementwise_mul
+    data_type : x
+
+- op : fused_elementwise_sub
+  args: (Tensor x, Tensor y, int axis = -1, str fuse_activation = "", float fuse_alpha
+    = 0.0f, float fuse_beta = 0.0f, float fused_output_scale = 1.0f, int[] fused_unsqueeze2_axes
+    = {}, float scale_x = 1.0f, float scale_y = 1.0f, float scale_out = 1.0f)
+  output: Tensor (out)
+  infer_meta:
+    func: ElementwiseInferMeta
+    param : [x, y]
+  kernel :
+    func : fused_elementwise_sub
+    data_type : x
+
 - op : gammaincc
   args : (Tensor x, Tensor y)
   output : Tensor(out)
diff --git a/paddle/phi/kernels/legacy/cpu/fused_elementwise_kernel.cc b/paddle/phi/kernels/legacy/cpu/fused_elementwise_kernel.cc
new file mode 100644
index 0000000000000..f8c19b3a3f9ad
--- /dev/null
+++ b/paddle/phi/kernels/legacy/cpu/fused_elementwise_kernel.cc
@@ -0,0 +1,153 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/api/ext/dispatch.h"
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/common/bfloat16.h"
+#include "paddle/phi/common/complex.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/cpu/elementwise.h"
+#include "paddle/phi/kernels/impl/elementwise_kernel_impl.h"
+#include "paddle/phi/kernels/legacy/elementwise_add_kernel.h"
+#include "paddle/phi/kernels/legacy/elementwise_divide_kernel.h"
+#include "paddle/phi/kernels/legacy/elementwise_multipy_kernel.h"
+#include "paddle/phi/kernels/legacy/elementwise_subtract_kernel.h"
+
+namespace phi {
+
+template <typename T, typename Context>  // Kernel for fused_elementwise_add.
+void FusedElementwiseAddKernel(const Context& dev_ctx,
+                               const DenseTensor& x,
+                               const DenseTensor& y,
+                               int axis,  // forwarded to AddRawKernel
+                               const std::string& fuse_activation UNUSED,
+                               float fuse_alpha UNUSED,
+                               float fuse_beta UNUSED,
+                               float fused_output_scale UNUSED,
+                               const std::vector<int>& fused_unsqueeze2_axes
+                                   UNUSED,
+                               float scale_x UNUSED,
+                               float scale_y UNUSED,
+                               float scale_out UNUSED,
+                               DenseTensor* out) {  // fused attrs are ignored
+  AddRawKernel<T, Context>(dev_ctx, x, y, axis, out);
+}
+
+template <typename T, typename Context>  // Kernel for fused_elementwise_div.
+void FusedElementwiseDivKernel(const Context& dev_ctx,
+                               const DenseTensor& x,
+                               const DenseTensor& y,
+                               int axis,  // forwarded to DivideRawKernel
+                               const std::string& fuse_activation UNUSED,
+                               float fuse_alpha UNUSED,
+                               float fuse_beta UNUSED,
+                               float fused_output_scale UNUSED,
+                               const std::vector<int>& fused_unsqueeze2_axes
+                                   UNUSED,
+                               float scale_x UNUSED,
+                               float scale_y UNUSED,
+                               float scale_out UNUSED,
+                               DenseTensor* out) {  // fused attrs are ignored
+  DivideRawKernel<T, Context>(dev_ctx, x, y, axis, out);
+}
+
+template <typename T, typename Context>  // Kernel for fused_elementwise_mul.
+void FusedElementwiseMulKernel(const Context& dev_ctx,
+                               const DenseTensor& x,
+                               const DenseTensor& y,
+                               int axis,  // forwarded to MultiplyRawKernel
+                               const std::string& fuse_activation UNUSED,
+                               float fuse_alpha UNUSED,
+                               float fuse_beta UNUSED,
+                               float fused_output_scale UNUSED,
+                               const std::vector<int>& fused_unsqueeze2_axes
+                                   UNUSED,
+                               float scale_x UNUSED,
+                               float scale_y UNUSED,
+                               float scale_out UNUSED,
+                               DenseTensor* out) {  // fused attrs are ignored
+  MultiplyRawKernel<T, Context>(dev_ctx, x, y, axis, out);
+}
+
+template <typename T, typename Context>  // Kernel for fused_elementwise_sub.
+void FusedElementwiseSubKernel(const Context& dev_ctx,
+                               const DenseTensor& x,
+                               const DenseTensor& y,
+                               int axis,  // forwarded to SubtractRawKernel
+                               const std::string& fuse_activation UNUSED,
+                               float fuse_alpha UNUSED,
+                               float fuse_beta UNUSED,
+                               float fused_output_scale UNUSED,
+                               const std::vector<int>& fused_unsqueeze2_axes
+                                   UNUSED,
+                               float scale_x UNUSED,
+                               float scale_y UNUSED,
+                               float scale_out UNUSED,
+                               DenseTensor* out) {  // fused attrs are ignored
+  SubtractRawKernel<T, Context>(dev_ctx, x, y, axis, out);
+}
+}  // namespace phi
+
+using complex64 = ::phi::dtype::complex<float>;
+using complex128 = ::phi::dtype::complex<double>;
+
+PD_REGISTER_KERNEL(fused_elementwise_add,  // registered kernel name
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::FusedElementwiseAddKernel,
+                   float,  // data types for this kernel start here
+                   double,
+                   int,
+                   bool,
+                   int64_t,
+                   complex64,
+                   complex128) {}
+
+PD_REGISTER_KERNEL(fused_elementwise_div,  // registered kernel name
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::FusedElementwiseDivKernel,
+                   float,  // data types for this kernel start here
+                   double,
+                   int,
+                   int64_t,
+                   bool,
+                   complex64,
+                   complex128) {}
+
+PD_REGISTER_KERNEL(fused_elementwise_mul,  // registered kernel name
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::FusedElementwiseMulKernel,
+                   float,  // data types for this kernel start here
+                   double,
+                   int,
+                   int64_t,
+                   bool,
+                   complex64,
+                   complex128,
+                   phi::dtype::bfloat16) {}  // NOTE(review): not in add/div
+
+PD_REGISTER_KERNEL(fused_elementwise_sub,  // registered kernel name
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::FusedElementwiseSubKernel,
+                   float,  // data types for this kernel start here
+                   double,
+                   int16_t,  // NOTE(review): int16_t here, no bool — confirm
+                   int,
+                   int64_t,
+                   complex64,
+                   complex128,
+                   phi::dtype::bfloat16) {}

From 4e0a13fdfa3a18d713cd5787f5acc35eb010ae29 Mon Sep 17 00:00:00 2001
From: co63oc <co63@163.com>
Date: Mon, 6 May 2024 18:51:51 +0800
Subject: [PATCH 2/5] Remove fluid fused_elementwise op/sig; move YAML to legacy_ops.yaml

---
 .../operators/fused/fused_elementwise_op.cc   | 95 -------------------
 .../ops_signature/fused_elementwise_sig.cc    | 92 ------------------
 paddle/phi/api/yaml/legacy_ops.yaml           | 48 ++++++++++
 paddle/phi/api/yaml/ops.yaml                  | 48 ----------
 4 files changed, 48 insertions(+), 235 deletions(-)
 delete mode 100644 paddle/fluid/operators/fused/fused_elementwise_op.cc
 delete mode 100644 paddle/fluid/operators/ops_signature/fused_elementwise_sig.cc

diff --git a/paddle/fluid/operators/fused/fused_elementwise_op.cc b/paddle/fluid/operators/fused/fused_elementwise_op.cc
deleted file mode 100644
index e6c2743e9385d..0000000000000
--- a/paddle/fluid/operators/fused/fused_elementwise_op.cc
+++ /dev/null
@@ -1,95 +0,0 @@
-//   Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/elementwise/elementwise_op.h"
-
-namespace paddle {
-namespace operators {
-
-class FusedElementwiseOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() final {
-    AddInput("X", "The first input tensor of elementwise op.");
-    AddInput("Y", "The second input tensor of elementwise op.");
-    AddOutput("Out", "A location into which the result is stored.");
-    AddAttr<int>(
-        "axis",
-        "If X.dimension != Y.dimension, Y.dimension must be a "
-        "subsequence of X.dimension. And axis is the start dimension index "
-        "for broadcasting Y onto X.")
-        .SetDefault(-1);
-    AddAttr<std::string>(
-        "fuse_activation",
-        "Activation type from elementwise_act_onednn_fuse_pass")
-        .SetDefault("");
-    AddAttr<float>("fuse_alpha",
-                   "Activation alpha from elementwise_act_onednn_fuse_pass")
-        .SetDefault(0.0f);
-    AddAttr<float>("fuse_beta",
-                   "Activation beta from elementwise_act_onednn_fuse_pass")
-        .SetDefault(0.0f);
-    AddAttr<float>("fused_output_scale",
-                   "Obtained from operator_scale_onednn_fuse_pass")
-        .SetDefault(1.0f);
-    AddAttr<std::vector<int>>(
-        "fused_unsqueeze2_axes",
-        "Obtained from operator_unsqueeze2_onednn_fuse_pass")
-        .SetDefault({});
-    AddAttr<float>("scale_x", "Elementwise X input quantization scale")
-        .SetDefault(1.0f);
-    AddAttr<float>("scale_y", "Elementwise Y input quantization scale")
-        .SetDefault(1.0f);
-    AddAttr<float>("scale_out", "Elementwise Out output quantization scale")
-        .SetDefault(1.0f);
-    AddComment(
-        R"DOC(Elementwise operator extended with oneDNN-specific fusion logic.)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(
-    fused_elementwise_add,
-    ops::ElementwiseOp,
-    ops::FusedElementwiseOpMaker,
-    ops::ElementwiseOpInferVarType,
-    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
-    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
-
-REGISTER_OPERATOR(
-    fused_elementwise_sub,
-    ops::ElementwiseOp,
-    ops::FusedElementwiseOpMaker,
-    ops::ElementwiseOpInferVarType,
-    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
-    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
-
-REGISTER_OPERATOR(
-    fused_elementwise_mul,
-    ops::ElementwiseOp,
-    ops::FusedElementwiseOpMaker,
-    ops::ElementwiseOpInferVarType,
-    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
-    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
-
-REGISTER_OPERATOR(
-    fused_elementwise_div,
-    ops::ElementwiseOp,
-    ops::FusedElementwiseOpMaker,
-    ops::ElementwiseOpInferVarType,
-    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
-    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
diff --git a/paddle/fluid/operators/ops_signature/fused_elementwise_sig.cc b/paddle/fluid/operators/ops_signature/fused_elementwise_sig.cc
deleted file mode 100644
index 34e0bfd314fd6..0000000000000
--- a/paddle/fluid/operators/ops_signature/fused_elementwise_sig.cc
+++ /dev/null
@@ -1,92 +0,0 @@
-// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/phi/core/compat/op_utils.h"
-
-namespace phi {
-
-KernelSignature FusedElementwiseAddOpArgumentMapping(
-    const ArgumentMappingContext& ctx) {
-  return KernelSignature("fused_elementwise_add",
-                         {"X", "Y"},
-                         {"axis",
-                          "fuse_activation",
-                          "fuse_alpha",
-                          "fuse_beta",
-                          "fused_output_scale",
-                          "fused_unsqueeze2_axes",
-                          "scale_x",
-                          "scale_y",
-                          "scale_out"},
-                         {"Out"});
-}
-
-KernelSignature FusedElementwiseSubOpArgumentMapping(
-    const ArgumentMappingContext& ctx) {
-  return KernelSignature("fused_elementwise_sub",
-                         {"X", "Y"},
-                         {"axis",
-                          "fuse_activation",
-                          "fuse_alpha",
-                          "fuse_beta",
-                          "fused_output_scale",
-                          "fused_unsqueeze2_axes",
-                          "scale_x",
-                          "scale_y",
-                          "scale_out"},
-                         {"Out"});
-}
-
-KernelSignature FusedElementwiseMulOpArgumentMapping(
-    const ArgumentMappingContext& ctx UNUSED) {
-  return KernelSignature("fused_elementwise_mul",
-                         {"X", "Y"},
-                         {"axis",
-                          "fuse_activation",
-                          "fuse_alpha",
-                          "fuse_beta",
-                          "fused_output_scale",
-                          "fused_unsqueeze2_axes",
-                          "scale_x",
-                          "scale_y",
-                          "scale_out"},
-                         {"Out"});
-}
-
-KernelSignature FusedElementwiseDivOpArgumentMapping(
-    const ArgumentMappingContext& ctx UNUSED) {
-  return KernelSignature("fused_elementwise_div",
-                         {"X", "Y"},
-                         {"axis",
-                          "fuse_activation",
-                          "fuse_alpha",
-                          "fuse_beta",
-                          "fused_output_scale",
-                          "fused_unsqueeze2_axes",
-                          "scale_x",
-                          "scale_y",
-                          "scale_out"},
-                         {"Out"});
-}
-
-}  // namespace phi
-
-PD_REGISTER_ARG_MAPPING_FN(fused_elementwise_add,
-                           phi::FusedElementwiseAddOpArgumentMapping);
-PD_REGISTER_ARG_MAPPING_FN(fused_elementwise_sub,
-                           phi::FusedElementwiseSubOpArgumentMapping);
-PD_REGISTER_ARG_MAPPING_FN(fused_elementwise_mul,
-                           phi::FusedElementwiseMulOpArgumentMapping);
-PD_REGISTER_ARG_MAPPING_FN(fused_elementwise_div,
-                           phi::FusedElementwiseDivOpArgumentMapping);
diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml
index 6260945b48971..dd73030642fbe 100755
--- a/paddle/phi/api/yaml/legacy_ops.yaml
+++ b/paddle/phi/api/yaml/legacy_ops.yaml
@@ -603,6 +603,54 @@
   view : (mean -> mean_out), (variance -> variance_out)
   backward : fused_bn_add_activation_grad
 
+- op : fused_elementwise_add
+  args: (Tensor x, Tensor y, int axis = -1, str fuse_activation = "", float fuse_alpha
+    = 0.0f, float fuse_beta = 0.0f, float fused_output_scale = 1.0f, int[] fused_unsqueeze2_axes
+    = {}, float scale_x = 1.0f, float scale_y = 1.0f, float scale_out = 1.0f)
+  output: Tensor (out)
+  infer_meta:
+    func: ElementwiseInferMeta
+    param : [x, y]
+  kernel :
+    func : fused_elementwise_add
+    data_type : x
+
+- op : fused_elementwise_div
+  args: (Tensor x, Tensor y, int axis = -1, str fuse_activation = "", float fuse_alpha
+    = 0.0f, float fuse_beta = 0.0f, float fused_output_scale = 1.0f, int[] fused_unsqueeze2_axes
+    = {}, float scale_x = 1.0f, float scale_y = 1.0f, float scale_out = 1.0f)
+  output: Tensor (out)
+  infer_meta:
+    func: ElementwiseInferMeta
+    param : [x, y]
+  kernel :
+    func : fused_elementwise_div
+    data_type : x
+
+- op : fused_elementwise_mul
+  args: (Tensor x, Tensor y, int axis = -1, str fuse_activation = "", float fuse_alpha
+    = 0.0f, float fuse_beta = 0.0f, float fused_output_scale = 1.0f, int[] fused_unsqueeze2_axes
+    = {}, float scale_x = 1.0f, float scale_y = 1.0f, float scale_out = 1.0f)
+  output: Tensor (out)
+  infer_meta:
+    func: ElementwiseInferMeta
+    param : [x, y]
+  kernel :
+    func : fused_elementwise_mul
+    data_type : x
+
+- op : fused_elementwise_sub
+  args: (Tensor x, Tensor y, int axis = -1, str fuse_activation = "", float fuse_alpha
+    = 0.0f, float fuse_beta = 0.0f, float fused_output_scale = 1.0f, int[] fused_unsqueeze2_axes
+    = {}, float scale_x = 1.0f, float scale_y = 1.0f, float scale_out = 1.0f)
+  output: Tensor (out)
+  infer_meta:
+    func: ElementwiseInferMeta
+    param : [x, y]
+  kernel :
+    func : fused_elementwise_sub
+    data_type : x
+
 - op : fused_gemm_epilogue
   args : (Tensor x, Tensor y, Tensor bias, bool trans_x, bool trans_y, str activation)
   output : Tensor(out), Tensor(reserve_space)
diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml
index 08713e4bfa432..98da34dd2d442 100755
--- a/paddle/phi/api/yaml/ops.yaml
+++ b/paddle/phi/api/yaml/ops.yaml
@@ -1227,54 +1227,6 @@
     backend : place
   interfaces : paddle::dialect::InferSymbolicShapeInterface
 
-- op : fused_elementwise_add
-  args: (Tensor x, Tensor y, int axis = -1, str fuse_activation = "", float fuse_alpha
-    = 0.0f, float fuse_beta = 0.0f, float fused_output_scale = 1.0f, int[] fused_unsqueeze2_axes
-    = {}, float scale_x = 1.0f, float scale_y = 1.0f, float scale_out = 1.0f)
-  output: Tensor (out)
-  infer_meta:
-    func: ElementwiseInferMeta
-    param : [x, y]
-  kernel :
-    func : fused_elementwise_add
-    data_type : x
-
-- op : fused_elementwise_div
-  args: (Tensor x, Tensor y, int axis = -1, str fuse_activation = "", float fuse_alpha
-    = 0.0f, float fuse_beta = 0.0f, float fused_output_scale = 1.0f, int[] fused_unsqueeze2_axes
-    = {}, float scale_x = 1.0f, float scale_y = 1.0f, float scale_out = 1.0f)
-  output: Tensor (out)
-  infer_meta:
-    func: ElementwiseInferMeta
-    param : [x, y]
-  kernel :
-    func : fused_elementwise_div
-    data_type : x
-
-- op : fused_elementwise_mul
-  args: (Tensor x, Tensor y, int axis = -1, str fuse_activation = "", float fuse_alpha
-    = 0.0f, float fuse_beta = 0.0f, float fused_output_scale = 1.0f, int[] fused_unsqueeze2_axes
-    = {}, float scale_x = 1.0f, float scale_y = 1.0f, float scale_out = 1.0f)
-  output: Tensor (out)
-  infer_meta:
-    func: ElementwiseInferMeta
-    param : [x, y]
-  kernel :
-    func : fused_elementwise_mul
-    data_type : x
-
-- op : fused_elementwise_sub
-  args: (Tensor x, Tensor y, int axis = -1, str fuse_activation = "", float fuse_alpha
-    = 0.0f, float fuse_beta = 0.0f, float fused_output_scale = 1.0f, int[] fused_unsqueeze2_axes
-    = {}, float scale_x = 1.0f, float scale_y = 1.0f, float scale_out = 1.0f)
-  output: Tensor (out)
-  infer_meta:
-    func: ElementwiseInferMeta
-    param : [x, y]
-  kernel :
-    func : fused_elementwise_sub
-    data_type : x
-
 - op : gammaincc
   args : (Tensor x, Tensor y)
   output : Tensor(out)

From 5998e2b8879f08040569fc81733e54a8e8165a9e Mon Sep 17 00:00:00 2001
From: co63oc <co63@163.com>
Date: Mon, 6 May 2024 18:56:17 +0800
Subject: [PATCH 3/5] Move fused_elementwise YAML from legacy_ops.yaml back to ops.yaml

---
 paddle/phi/api/yaml/legacy_ops.yaml | 48 -----------------------------
 paddle/phi/api/yaml/ops.yaml        | 48 +++++++++++++++++++++++++++++
 2 files changed, 48 insertions(+), 48 deletions(-)

diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml
index dd73030642fbe..6260945b48971 100755
--- a/paddle/phi/api/yaml/legacy_ops.yaml
+++ b/paddle/phi/api/yaml/legacy_ops.yaml
@@ -603,54 +603,6 @@
   view : (mean -> mean_out), (variance -> variance_out)
   backward : fused_bn_add_activation_grad
 
-- op : fused_elementwise_add
-  args: (Tensor x, Tensor y, int axis = -1, str fuse_activation = "", float fuse_alpha
-    = 0.0f, float fuse_beta = 0.0f, float fused_output_scale = 1.0f, int[] fused_unsqueeze2_axes
-    = {}, float scale_x = 1.0f, float scale_y = 1.0f, float scale_out = 1.0f)
-  output: Tensor (out)
-  infer_meta:
-    func: ElementwiseInferMeta
-    param : [x, y]
-  kernel :
-    func : fused_elementwise_add
-    data_type : x
-
-- op : fused_elementwise_div
-  args: (Tensor x, Tensor y, int axis = -1, str fuse_activation = "", float fuse_alpha
-    = 0.0f, float fuse_beta = 0.0f, float fused_output_scale = 1.0f, int[] fused_unsqueeze2_axes
-    = {}, float scale_x = 1.0f, float scale_y = 1.0f, float scale_out = 1.0f)
-  output: Tensor (out)
-  infer_meta:
-    func: ElementwiseInferMeta
-    param : [x, y]
-  kernel :
-    func : fused_elementwise_div
-    data_type : x
-
-- op : fused_elementwise_mul
-  args: (Tensor x, Tensor y, int axis = -1, str fuse_activation = "", float fuse_alpha
-    = 0.0f, float fuse_beta = 0.0f, float fused_output_scale = 1.0f, int[] fused_unsqueeze2_axes
-    = {}, float scale_x = 1.0f, float scale_y = 1.0f, float scale_out = 1.0f)
-  output: Tensor (out)
-  infer_meta:
-    func: ElementwiseInferMeta
-    param : [x, y]
-  kernel :
-    func : fused_elementwise_mul
-    data_type : x
-
-- op : fused_elementwise_sub
-  args: (Tensor x, Tensor y, int axis = -1, str fuse_activation = "", float fuse_alpha
-    = 0.0f, float fuse_beta = 0.0f, float fused_output_scale = 1.0f, int[] fused_unsqueeze2_axes
-    = {}, float scale_x = 1.0f, float scale_y = 1.0f, float scale_out = 1.0f)
-  output: Tensor (out)
-  infer_meta:
-    func: ElementwiseInferMeta
-    param : [x, y]
-  kernel :
-    func : fused_elementwise_sub
-    data_type : x
-
 - op : fused_gemm_epilogue
   args : (Tensor x, Tensor y, Tensor bias, bool trans_x, bool trans_y, str activation)
   output : Tensor(out), Tensor(reserve_space)
diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml
index 98da34dd2d442..08713e4bfa432 100755
--- a/paddle/phi/api/yaml/ops.yaml
+++ b/paddle/phi/api/yaml/ops.yaml
@@ -1227,6 +1227,54 @@
     backend : place
   interfaces : paddle::dialect::InferSymbolicShapeInterface
 
+- op : fused_elementwise_add
+  args: (Tensor x, Tensor y, int axis = -1, str fuse_activation = "", float fuse_alpha
+    = 0.0f, float fuse_beta = 0.0f, float fused_output_scale = 1.0f, int[] fused_unsqueeze2_axes
+    = {}, float scale_x = 1.0f, float scale_y = 1.0f, float scale_out = 1.0f)
+  output: Tensor (out)
+  infer_meta:
+    func: ElementwiseInferMeta
+    param : [x, y]
+  kernel :
+    func : fused_elementwise_add
+    data_type : x
+
+- op : fused_elementwise_div
+  args: (Tensor x, Tensor y, int axis = -1, str fuse_activation = "", float fuse_alpha
+    = 0.0f, float fuse_beta = 0.0f, float fused_output_scale = 1.0f, int[] fused_unsqueeze2_axes
+    = {}, float scale_x = 1.0f, float scale_y = 1.0f, float scale_out = 1.0f)
+  output: Tensor (out)
+  infer_meta:
+    func: ElementwiseInferMeta
+    param : [x, y]
+  kernel :
+    func : fused_elementwise_div
+    data_type : x
+
+- op : fused_elementwise_mul
+  args: (Tensor x, Tensor y, int axis = -1, str fuse_activation = "", float fuse_alpha
+    = 0.0f, float fuse_beta = 0.0f, float fused_output_scale = 1.0f, int[] fused_unsqueeze2_axes
+    = {}, float scale_x = 1.0f, float scale_y = 1.0f, float scale_out = 1.0f)
+  output: Tensor (out)
+  infer_meta:
+    func: ElementwiseInferMeta
+    param : [x, y]
+  kernel :
+    func : fused_elementwise_mul
+    data_type : x
+
+- op : fused_elementwise_sub
+  args: (Tensor x, Tensor y, int axis = -1, str fuse_activation = "", float fuse_alpha
+    = 0.0f, float fuse_beta = 0.0f, float fused_output_scale = 1.0f, int[] fused_unsqueeze2_axes
+    = {}, float scale_x = 1.0f, float scale_y = 1.0f, float scale_out = 1.0f)
+  output: Tensor (out)
+  infer_meta:
+    func: ElementwiseInferMeta
+    param : [x, y]
+  kernel :
+    func : fused_elementwise_sub
+    data_type : x
+
 - op : gammaincc
   args : (Tensor x, Tensor y)
   output : Tensor(out)

From 6cf1b06286d2903c6f86facdad97acbd30ff0a4e Mon Sep 17 00:00:00 2001
From: co63oc <co63@163.com>
Date: Tue, 7 May 2024 19:04:57 +0800
Subject: [PATCH 4/5] Move fused_elementwise YAML to fused_ops.yaml; tidy kernel file header

---
 paddle/phi/api/yaml/fused_ops.yaml            | 48 +++++++++++++++++++
 paddle/phi/api/yaml/ops.yaml                  | 48 -------------------
 .../legacy/cpu/fused_elementwise_kernel.cc    |  3 +-
 3 files changed, 49 insertions(+), 50 deletions(-)

diff --git a/paddle/phi/api/yaml/fused_ops.yaml b/paddle/phi/api/yaml/fused_ops.yaml
index 304c543d1a463..597b007b107d0 100644
--- a/paddle/phi/api/yaml/fused_ops.yaml
+++ b/paddle/phi/api/yaml/fused_ops.yaml
@@ -232,6 +232,54 @@
   backward : fused_dropout_add_grad
   support_dygraph_mode : true
 
+- op : fused_elementwise_add
+  args: (Tensor x, Tensor y, int axis = -1, str fuse_activation = "", float fuse_alpha
+    = 0.0f, float fuse_beta = 0.0f, float fused_output_scale = 1.0f, int[] fused_unsqueeze2_axes
+    = {}, float scale_x = 1.0f, float scale_y = 1.0f, float scale_out = 1.0f)
+  output: Tensor (out)
+  infer_meta:
+    func: ElementwiseInferMeta
+    param : [x, y]
+  kernel :
+    func : fused_elementwise_add
+    data_type : x
+
+- op : fused_elementwise_div
+  args: (Tensor x, Tensor y, int axis = -1, str fuse_activation = "", float fuse_alpha
+    = 0.0f, float fuse_beta = 0.0f, float fused_output_scale = 1.0f, int[] fused_unsqueeze2_axes
+    = {}, float scale_x = 1.0f, float scale_y = 1.0f, float scale_out = 1.0f)
+  output: Tensor (out)
+  infer_meta:
+    func: ElementwiseInferMeta
+    param : [x, y]
+  kernel :
+    func : fused_elementwise_div
+    data_type : x
+
+- op : fused_elementwise_mul
+  args: (Tensor x, Tensor y, int axis = -1, str fuse_activation = "", float fuse_alpha
+    = 0.0f, float fuse_beta = 0.0f, float fused_output_scale = 1.0f, int[] fused_unsqueeze2_axes
+    = {}, float scale_x = 1.0f, float scale_y = 1.0f, float scale_out = 1.0f)
+  output: Tensor (out)
+  infer_meta:
+    func: ElementwiseInferMeta
+    param : [x, y]
+  kernel :
+    func : fused_elementwise_mul
+    data_type : x
+
+- op : fused_elementwise_sub
+  args: (Tensor x, Tensor y, int axis = -1, str fuse_activation = "", float fuse_alpha
+    = 0.0f, float fuse_beta = 0.0f, float fused_output_scale = 1.0f, int[] fused_unsqueeze2_axes
+    = {}, float scale_x = 1.0f, float scale_y = 1.0f, float scale_out = 1.0f)
+  output: Tensor (out)
+  infer_meta:
+    func: ElementwiseInferMeta
+    param : [x, y]
+  kernel :
+    func : fused_elementwise_sub
+    data_type : x
+
 - op : fused_embedding_eltwise_layernorm
   args : (Tensor[] ids, Tensor[] embs, Tensor bias, Tensor scale, float epsilon = 0.00001f)
   output : Tensor(out)
diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml
index 08713e4bfa432..98da34dd2d442 100755
--- a/paddle/phi/api/yaml/ops.yaml
+++ b/paddle/phi/api/yaml/ops.yaml
@@ -1227,54 +1227,6 @@
     backend : place
   interfaces : paddle::dialect::InferSymbolicShapeInterface
 
-- op : fused_elementwise_add
-  args: (Tensor x, Tensor y, int axis = -1, str fuse_activation = "", float fuse_alpha
-    = 0.0f, float fuse_beta = 0.0f, float fused_output_scale = 1.0f, int[] fused_unsqueeze2_axes
-    = {}, float scale_x = 1.0f, float scale_y = 1.0f, float scale_out = 1.0f)
-  output: Tensor (out)
-  infer_meta:
-    func: ElementwiseInferMeta
-    param : [x, y]
-  kernel :
-    func : fused_elementwise_add
-    data_type : x
-
-- op : fused_elementwise_div
-  args: (Tensor x, Tensor y, int axis = -1, str fuse_activation = "", float fuse_alpha
-    = 0.0f, float fuse_beta = 0.0f, float fused_output_scale = 1.0f, int[] fused_unsqueeze2_axes
-    = {}, float scale_x = 1.0f, float scale_y = 1.0f, float scale_out = 1.0f)
-  output: Tensor (out)
-  infer_meta:
-    func: ElementwiseInferMeta
-    param : [x, y]
-  kernel :
-    func : fused_elementwise_div
-    data_type : x
-
-- op : fused_elementwise_mul
-  args: (Tensor x, Tensor y, int axis = -1, str fuse_activation = "", float fuse_alpha
-    = 0.0f, float fuse_beta = 0.0f, float fused_output_scale = 1.0f, int[] fused_unsqueeze2_axes
-    = {}, float scale_x = 1.0f, float scale_y = 1.0f, float scale_out = 1.0f)
-  output: Tensor (out)
-  infer_meta:
-    func: ElementwiseInferMeta
-    param : [x, y]
-  kernel :
-    func : fused_elementwise_mul
-    data_type : x
-
-- op : fused_elementwise_sub
-  args: (Tensor x, Tensor y, int axis = -1, str fuse_activation = "", float fuse_alpha
-    = 0.0f, float fuse_beta = 0.0f, float fused_output_scale = 1.0f, int[] fused_unsqueeze2_axes
-    = {}, float scale_x = 1.0f, float scale_y = 1.0f, float scale_out = 1.0f)
-  output: Tensor (out)
-  infer_meta:
-    func: ElementwiseInferMeta
-    param : [x, y]
-  kernel :
-    func : fused_elementwise_sub
-    data_type : x
-
 - op : gammaincc
   args : (Tensor x, Tensor y)
   output : Tensor(out)
diff --git a/paddle/phi/kernels/legacy/cpu/fused_elementwise_kernel.cc b/paddle/phi/kernels/legacy/cpu/fused_elementwise_kernel.cc
index f8c19b3a3f9ad..ec640c2257c3f 100644
--- a/paddle/phi/kernels/legacy/cpu/fused_elementwise_kernel.cc
+++ b/paddle/phi/kernels/legacy/cpu/fused_elementwise_kernel.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/api/ext/dispatch.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/common/complex.h"

From dc0acac734ed1f9ae985ac3524ff8dfeaa02a810 Mon Sep 17 00:00:00 2001
From: co63oc <co63@163.com>
Date: Thu, 9 May 2024 13:18:23 +0800
Subject: [PATCH 5/5] Set support_dygraph_mode for fused_elementwise ops; update ops_api_gen.py

---
 paddle/fluid/pir/dialect/op_generator/ops_api_gen.py | 4 ++++
 paddle/phi/api/yaml/fused_ops.yaml                   | 4 ++++
 2 files changed, 8 insertions(+)

diff --git a/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py b/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py
index 2647b579f2bc7..ef33da14eb6ab 100644
--- a/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py
+++ b/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py
@@ -161,6 +161,10 @@
     'fused_token_prune',
     'fused_dconv_drelu_dbn',
     'fused_dot_product_attention',
+    'fused_elementwise_add',
+    'fused_elementwise_div',
+    'fused_elementwise_mul',
+    'fused_elementwise_sub',
     'nce',
     'lars_momentum',
     'lars_momentum_',
diff --git a/paddle/phi/api/yaml/fused_ops.yaml b/paddle/phi/api/yaml/fused_ops.yaml
index 597b007b107d0..df4ca1ce124d3 100644
--- a/paddle/phi/api/yaml/fused_ops.yaml
+++ b/paddle/phi/api/yaml/fused_ops.yaml
@@ -243,6 +243,7 @@
   kernel :
     func : fused_elementwise_add
     data_type : x
+  support_dygraph_mode : true
 
 - op : fused_elementwise_div
   args: (Tensor x, Tensor y, int axis = -1, str fuse_activation = "", float fuse_alpha
@@ -255,6 +256,7 @@
   kernel :
     func : fused_elementwise_div
     data_type : x
+  support_dygraph_mode : true
 
 - op : fused_elementwise_mul
   args: (Tensor x, Tensor y, int axis = -1, str fuse_activation = "", float fuse_alpha
@@ -267,6 +269,7 @@
   kernel :
     func : fused_elementwise_mul
     data_type : x
+  support_dygraph_mode : true
 
 - op : fused_elementwise_sub
   args: (Tensor x, Tensor y, int axis = -1, str fuse_activation = "", float fuse_alpha
@@ -279,6 +282,7 @@
   kernel :
     func : fused_elementwise_sub
     data_type : x
+  support_dygraph_mode : true
 
 - op : fused_embedding_eltwise_layernorm
   args : (Tensor[] ids, Tensor[] embs, Tensor bias, Tensor scale, float epsilon = 0.00001f)