From 6cdaa371be301b9b7a7c34b8f8c45319b0ce70a3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=9F=A0=E6=AA=AC=E5=91=B3=7E?=
 <93066842+Lemon-er@users.noreply.github.com>
Date: Mon, 5 Dec 2022 11:20:08 +0800
Subject: [PATCH 01/13] DenseTensor (#48419)

---
 .../sequence_ops/sequence_conv_op.cc          | 29 ++++++------
 .../operators/sequence_ops/sequence_conv_op.h | 34 +++++++-------
 .../sequence_ops/sequence_conv_op_xpu.cc      | 32 ++++++-------
 .../sequence_ops/sequence_enumerate_op.cc     |  8 ++--
 .../sequence_ops/sequence_enumerate_op.cu     |  5 +-
 .../sequence_ops/sequence_enumerate_op.h      |  7 ++-
 .../sequence_ops/sequence_erase_op.cc         | 37 ++++++++-------
 .../sequence_ops/sequence_erase_op.cu         |  5 +-
 .../sequence_ops/sequence_expand_as_op.cc     | 23 ++++-----
 .../sequence_ops/sequence_expand_as_op.cu     | 10 ++--
 .../sequence_ops/sequence_expand_op.h         | 34 +++++++-------
 .../sequence_ops/sequence_mask_op.cc          |  2 +-
 .../operators/sequence_ops/sequence_mask_op.h | 14 +++---
 .../sequence_ops/sequence_mask_op_npu.cc      | 12 ++---
 .../operators/sequence_ops/sequence_pad_op.cc | 34 +++++++-------
 .../operators/sequence_ops/sequence_pad_op.h  | 26 +++++-----
 .../sequence_ops/sequence_pool_op.cc          | 17 ++++---
 .../operators/sequence_ops/sequence_pool_op.h | 22 ++++-----
 .../sequence_ops/sequence_softmax_op.h        | 47 +++++++++----------
 19 files changed, 195 insertions(+), 203 deletions(-)

diff --git a/paddle/fluid/operators/sequence_ops/sequence_conv_op.cc b/paddle/fluid/operators/sequence_ops/sequence_conv_op.cc
index 7056c52cd8ba8..57669dbcd6a40 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_conv_op.cc
+++ b/paddle/fluid/operators/sequence_ops/sequence_conv_op.cc
@@ -145,30 +145,31 @@ class SequenceConvOpMaker : public framework::OpProtoAndCheckerMaker {
   void Make() override {
     AddInput(
         "X",
-        "(LoDTensor) the input(X) is a LodTensor, which supports "
+        "(phi::DenseTensor) the input(X) is a LodTensor, which supports "
         "variable-time length input sequence. The underlying tensor in "
-        "this LoDTensor is a matrix with shape (T, N), where T is the "
+        "this phi::DenseTensor is a matrix with shape (T, N), where T is the "
         "total time steps in this mini-batch and N is the input_hidden_size.");
-    AddInput("PaddingData",
-             "(Tensor, optional) the input(PaddingData) is an optional "
-             "parameter, and it is learnable. "
-             "This is a tensor with shape (P, N), where P is the "
-             "top_pad + bottom_pad, N is the input_hidden_size. In order to "
-             "ensure the equal length of sequence before and after "
-             "convolution, it is necessary to fill the top and bottom of each "
-             "sequence according to context_length, context_stride and "
-             "context_start")
+    AddInput(
+        "PaddingData",
+        "(phi::DenseTensor, optional) the input(PaddingData) is an optional "
+        "parameter, and it is learnable. "
+        "This is a tensor with shape (P, N), where P is the "
+        "top_pad + bottom_pad, N is the input_hidden_size. In order to "
+        "ensure the equal length of sequence before and after "
+        "convolution, it is necessary to fill the top and bottom of each "
+        "sequence according to context_length, context_stride and "
+        "context_start")
         .AsDispensable();
     AddInput(
         "Filter",
-        "(Tensor) the input(Filter) is an learnable parameter."
+        "(phi::DenseTensor) the input(Filter) is an learnable parameter."
         "This is a tensor with shape (K, M), where K is the "
         "context_length * input_hidden_size, M is the output feature size.");
     AddOutput(
         "Out",
-        "(LoDTensor) the output(Out) is a LodTensor, which support "
+        "(phi::DenseTensor) the output(Out) is a LodTensor, which support "
         "variable-time length output sequence. The underlying tensor in "
-        "this LoDTensor is a matrix with shape (T, M), where, T is the "
+        "this phi::DenseTensor is a matrix with shape (T, M), where, T is the "
         "total time steps in this mini-batch, M is the output feature size.");
 
     AddAttr<bool>("paddingTrainable",
diff --git a/paddle/fluid/operators/sequence_ops/sequence_conv_op.h b/paddle/fluid/operators/sequence_ops/sequence_conv_op.h
index 5dec776c32072..cf34cde478c35 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_conv_op.h
+++ b/paddle/fluid/operators/sequence_ops/sequence_conv_op.h
@@ -22,15 +22,12 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-using Tensor = phi::DenseTensor;
-using LoDTensor = phi::DenseTensor;
-
 template <typename DeviceContext, typename T>
 class SequenceConvKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto* in = context.Input<LoDTensor>("X");
-    auto* out = context.Output<LoDTensor>("Out");
+    auto* in = context.Input<phi::DenseTensor>("X");
+    auto* out = context.Output<phi::DenseTensor>("Out");
     auto filter = *context.Input<phi::DenseTensor>("Filter");
 
     out->mutable_data<T>(context.GetPlace());
@@ -40,11 +37,11 @@ class SequenceConvKernel : public framework::OpKernel<T> {
     int context_stride = context.Attr<int>("contextStride");
     bool padding_trainable = context.Attr<bool>("paddingTrainable");
 
-    PADDLE_ENFORCE_EQ(
-        in->lod().empty(),
-        false,
-        platform::errors::InvalidArgument("Input(X) Tensor of SequenceConvOp "
-                                          "does not contain LoD information."));
+    PADDLE_ENFORCE_EQ(in->lod().empty(),
+                      false,
+                      platform::errors::InvalidArgument(
+                          "Input(X) phi::DenseTensor of SequenceConvOp "
+                          "does not contain LoD information."));
     PADDLE_ENFORCE_EQ(
         in->lod().size(),
         1UL,
@@ -64,7 +61,7 @@ class SequenceConvKernel : public framework::OpKernel<T> {
 
     framework::DDim col_shape = {in->dims()[0],
                                  context_length * sequence_width};
-    Tensor col;
+    phi::DenseTensor col;
     col.mutable_data<T>(col_shape, context.GetPlace());
     // Because if padding_trainable is false, padding data should be zeros.
     phi::funcs::SetConstant<DeviceContext, T> set_zero;
@@ -92,13 +89,14 @@ template <typename DeviceContext, typename T>
 class SequenceConvGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto* in_g = context.Output<LoDTensor>(framework::GradVarName("X"));
-    auto* out_g = context.Input<LoDTensor>(framework::GradVarName("Out"));
+    auto* in_g = context.Output<phi::DenseTensor>(framework::GradVarName("X"));
+    auto* out_g =
+        context.Input<phi::DenseTensor>(framework::GradVarName("Out"));
     auto* filter_g =
         context.Output<phi::DenseTensor>(framework::GradVarName("Filter"));
     auto* padding_data_g =
         context.Output<phi::DenseTensor>(framework::GradVarName("PaddingData"));
-    auto* in = context.Input<LoDTensor>("X");
+    auto* in = context.Input<phi::DenseTensor>("X");
     auto* filter = context.Input<phi::DenseTensor>("Filter");
 
     int context_start = context.Attr<int>("contextStart");
@@ -125,7 +123,7 @@ class SequenceConvGradKernel : public framework::OpKernel<T> {
     // use col_shape in the im2col calculation
     framework::DDim col_shape = {in->dims()[0],
                                  sequence_width * context_length};
-    Tensor col;
+    phi::DenseTensor col;
 
     if (in_g || filter_g || (padding_trainable && padding_data_g)) {
       col.mutable_data<T>(col_shape, context.GetPlace());
@@ -159,7 +157,7 @@ class SequenceConvGradKernel : public framework::OpKernel<T> {
       padding_data_g->mutable_data<T>(context.GetPlace());
       set_zero(dev_ctx, padding_data_g, static_cast<T>(0));
 
-      LoDTensor* input = const_cast<LoDTensor*>(in);
+      phi::DenseTensor* input = const_cast<phi::DenseTensor*>(in);
       seq_project_grad_functor(dev_ctx,
                                *input,
                                padding_trainable,
@@ -178,8 +176,8 @@ class SequenceConvGradKernel : public framework::OpKernel<T> {
       filter_g->mutable_data<T>(context.GetPlace());
       set_zero(dev_ctx, filter_g, static_cast<T>(0));
 
-      Tensor filter_grad = *filter_g;
-      LoDTensor out_grad = *out_g;
+      phi::DenseTensor filter_grad = *filter_g;
+      phi::DenseTensor out_grad = *out_g;
 
       const phi::DenseTensor* padding_data = nullptr;
       if (padding_trainable) {
diff --git a/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc b/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc
index f0083ec4042e6..f7b0b5c3b581a 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc
+++ b/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc
@@ -19,14 +19,13 @@ limitations under the License. */
 
 namespace paddle {
 namespace operators {
-using Tensor = phi::DenseTensor;
 
 template <typename DeviceContext, typename T>
 class SequenceConvXPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto* in = context.Input<LoDTensor>("X");
-    auto* out = context.Output<LoDTensor>("Out");
+    auto* in = context.Input<phi::DenseTensor>("X");
+    auto* out = context.Output<phi::DenseTensor>("Out");
     auto filter = *context.Input<phi::DenseTensor>("Filter");
 
     out->mutable_data<T>(context.GetPlace());
@@ -36,11 +35,11 @@ class SequenceConvXPUKernel : public framework::OpKernel<T> {
     int context_stride = context.Attr<int>("contextStride");
     bool padding_trainable = context.Attr<bool>("paddingTrainable");
 
-    PADDLE_ENFORCE_EQ(
-        in->lod().empty(),
-        false,
-        platform::errors::InvalidArgument("Input(X) Tensor of SequenceConvOp "
-                                          "does not contain LoD information."));
+    PADDLE_ENFORCE_EQ(in->lod().empty(),
+                      false,
+                      platform::errors::InvalidArgument(
+                          "Input(X) phi::DenseTensor of SequenceConvOp "
+                          "does not contain LoD information."));
     PADDLE_ENFORCE_EQ(
         in->lod().size(),
         1UL,
@@ -159,11 +158,12 @@ template <typename DeviceContext, typename T>
 class SequenceConvGradXPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto* in_g = context.Output<LoDTensor>(framework::GradVarName("X"));
-    auto* out_g = context.Input<LoDTensor>(framework::GradVarName("Out"));
+    auto* in_g = context.Output<phi::DenseTensor>(framework::GradVarName("X"));
+    auto* out_g =
+        context.Input<phi::DenseTensor>(framework::GradVarName("Out"));
     auto* filter_g =
         context.Output<phi::DenseTensor>(framework::GradVarName("Filter"));
-    auto* in = context.Input<LoDTensor>("X");
+    auto* in = context.Input<phi::DenseTensor>("X");
     auto* filter = context.Input<phi::DenseTensor>("Filter");
 
     int context_start = context.Attr<int>("contextStart");
@@ -171,11 +171,11 @@ class SequenceConvGradXPUKernel : public framework::OpKernel<T> {
     int context_stride = context.Attr<int>("contextStride");
     bool padding_trainable = context.Attr<bool>("paddingTrainable");
 
-    PADDLE_ENFORCE_EQ(
-        in->lod().empty(),
-        false,
-        platform::errors::InvalidArgument("Input(X) Tensor of SequenceConvOp "
-                                          "does not contain LoD information."));
+    PADDLE_ENFORCE_EQ(in->lod().empty(),
+                      false,
+                      platform::errors::InvalidArgument(
+                          "Input(X) phi::DenseTensor of SequenceConvOp "
+                          "does not contain LoD information."));
     PADDLE_ENFORCE_EQ(
         in->lod().size(),
         1UL,
diff --git a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc
index 337ea46b260e9..979296eb044cc 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc
+++ b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc
@@ -36,11 +36,11 @@ class SequenceEnumerateOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
     AddInput("X",
-             "(2-D LoDTensor with the 2nd dimension equal to 1) "
-             "Input LoDTensor of SequenceEnumerate operator.");
+             "(2-D phi::DenseTensor with the 2nd dimension equal to 1) "
+             "Input phi::DenseTensor of SequenceEnumerate operator.");
     AddOutput("Out",
-              "(2-D LoDTensor with the 2nd dimension equal to win_size) "
-              "Output LoDTensor of SequenceEnumerate operator.");
+              "(2-D phi::DenseTensor with the 2nd dimension equal to win_size) "
+              "Output phi::DenseTensor of SequenceEnumerate operator.");
     AddAttr<int>("win_size", "(int) The enumerate sequence window size.")
         .AddCustomChecker([](const int& win_size) {
           PADDLE_ENFORCE_GE(win_size,
diff --git a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu
index 0f53f292ef8ae..ee69333f924fe 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu
+++ b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu
@@ -21,7 +21,6 @@
 namespace paddle {
 namespace operators {
 using phi::PADDLE_CUDA_NUM_THREADS;
-using LoDTensor = phi::DenseTensor;
 
 template <typename T>
 __global__ void CalcOutPut(const T* in_data,
@@ -52,8 +51,8 @@ template <typename T>
 class SequenceEnumerateOpCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto* in = context.Input<LoDTensor>("X");
-    auto* out = context.Output<LoDTensor>("Out");
+    auto* in = context.Input<phi::DenseTensor>("X");
+    auto* out = context.Output<phi::DenseTensor>("Out");
     int win_size = context.Attr<int>("win_size");
     int pad_value = context.Attr<int>("pad_value");
 
diff --git a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h
index 90cb930062621..048f28d85917b 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h
+++ b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h
@@ -18,14 +18,13 @@
 
 namespace paddle {
 namespace operators {
-using LoDTensor = phi::DenseTensor;
 
 template <typename DeviceContext, typename T>
 class SequenceEnumerateKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto* in = context.Input<LoDTensor>("X");
-    auto* out = context.Output<LoDTensor>("Out");
+    auto* in = context.Input<phi::DenseTensor>("X");
+    auto* out = context.Output<phi::DenseTensor>("Out");
     int win_size = context.Attr<int>("win_size");
     auto pad_value = static_cast<T>(context.Attr<int>("pad_value"));
 
@@ -33,7 +32,7 @@ class SequenceEnumerateKernel : public framework::OpKernel<T> {
         in->lod().empty(),
         false,
         platform::errors::InvalidArgument(
-            "Input(X) Tensor of SequenceEnumerateOp does not contain "
+            "Input(X) phi::DenseTensor of SequenceEnumerateOp does not contain "
             "LoD information."));
 
     auto in_dims = in->dims();
diff --git a/paddle/fluid/operators/sequence_ops/sequence_erase_op.cc b/paddle/fluid/operators/sequence_ops/sequence_erase_op.cc
index 2943b8895978f..fe50d8502c0eb 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_erase_op.cc
+++ b/paddle/fluid/operators/sequence_ops/sequence_erase_op.cc
@@ -27,20 +27,21 @@ class SequenceEraseOp : public framework::OperatorWithKernel {
     OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "SequenceErase");
     OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "SequenceErase");
     auto x_dims = ctx->GetInputDim("X");
-    PADDLE_ENFORCE(x_dims.size() == 2 && x_dims[1] == 1,
-                   platform::errors::InvalidArgument(
-                       "Input(X) of SequenceEraseOp should be a 2-D LoDTensor "
-                       "with the 2nd dimension equal to 1,"
-                       "but received size %d with the 2nd dimension %d.",
-                       x_dims.size(),
-                       x_dims[1]));
+    PADDLE_ENFORCE(
+        x_dims.size() == 2 && x_dims[1] == 1,
+        platform::errors::InvalidArgument(
+            "Input(X) of SequenceEraseOp should be a 2-D phi::DenseTensor "
+            "with the 2nd dimension equal to 1,"
+            "but received size %d with the 2nd dimension %d.",
+            x_dims.size(),
+            x_dims[1]));
     ctx->SetOutputDim("Out", x_dims);
-    // The output LoDTensor's lod_level should be input X's lod_level.
+    // The output phi::DenseTensor's lod_level should be input X's lod_level.
     // For compile-time, we call SetLoDLevel to set output's lod_level.
-    // For runtime, output LoDTensor's lod is determined by input X's lod and
-    // the level specified by input RandTable.
-    // We cannot get X's detail lod and RankTable's level in this function, so
-    // leave this work to the detail kernel implementation.
+    // For runtime, output phi::DenseTensor's lod is determined by input X's lod
+    // and the level specified by input RankTable. We cannot get X's detail lod
+    // and RankTable's level in this function, so leave this work to the detail
+    // kernel implementation.
     if (!ctx->IsRuntime()) {
       ctx->SetLoDLevel("Out", ctx->GetLoDLevel("X"));
     }
@@ -51,11 +52,11 @@ class SequenceEraseOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
     AddInput("X",
-             "(2-D LoDTensor with the 2nd dim. equal to 1) "
-             "Input LoDTensor of SequenceEraseOp.");
+             "(2-D phi::DenseTensor with the 2nd dim. equal to 1) "
+             "Input phi::DenseTensor of SequenceEraseOp.");
     AddOutput("Out",
-              "(2-D LoDTensor with the 2nd dim. equal to 1) "
-              "Output LoDTensor of SequenceEraseOp.");
+              "(2-D phi::DenseTensor with the 2nd dim. equal to 1) "
+              "Output phi::DenseTensor of SequenceEraseOp.");
     AddAttr<std::vector<int>>("tokens",
                               "(vector<int>) Tokens need to be erased from "
                               "input sequences.");
@@ -64,7 +65,7 @@ Sequence Erase Operator.
 
 Sequence erase operator erases tokens specified by Attr(tokens) from the input
 sequences Input(X), and outputs the remaining data and modifies the LoD
-information at the same time. For example, given a 2-D LoDTensor
+information at the same time. For example, given a 2-D phi::DenseTensor
 
     X = [[2, 2, 6, 1, 3, 9, 6, 1, 0, 1]]^T
 
@@ -77,7 +78,7 @@ operation, the three sequences become
 
     X1' = [[6]]^T, X2' = [[1, 9]]^T and X3' = [[6, 1, 0, 1]]^T.
 
-Hence the LoDTensor Output(Out) should be
+Hence the phi::DenseTensor Output(Out) should be
 
     Out = [[6, 1, 9, 6, 1, 0, 1]]^T,
 
diff --git a/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu b/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu
index d8b0afbc85dc5..b573df956df1c 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu
+++ b/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu
@@ -21,7 +21,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 using phi::PADDLE_CUDA_NUM_THREADS;
-using LoDTensor = phi::DenseTensor;
 
 template <typename T>
 __global__ void LabelErasedIdx(const T* in_dat,
@@ -67,8 +66,8 @@ template <typename T>
 class SequenceEraseOpCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in = ctx.Input<LoDTensor>("X");
-    auto* out = ctx.Output<LoDTensor>("Out");
+    auto* in = ctx.Input<phi::DenseTensor>("X");
+    auto* out = ctx.Output<phi::DenseTensor>("Out");
 
     auto lod = in->lod();
     PADDLE_ENFORCE_EQ(
diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cc b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cc
index aa27516a3356e..b1223618eea0d 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cc
+++ b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cc
@@ -20,8 +20,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-using LoDTensor = phi::DenseTensor;
-
 class SequenceExpandAsOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -49,8 +47,8 @@ class SequenceExpandAsOp : public framework::OperatorWithKernel {
       framework::Variable* y_var =
           PADDLE_GET(framework::Variable*, ctx->GetInputVarPtrs("Y")[0]);
 
-      auto& x_dim = x_var->Get<LoDTensor>().dims();
-      auto& y_lod = y_var->Get<LoDTensor>().lod();
+      auto& x_dim = x_var->Get<phi::DenseTensor>().dims();
+      auto& y_lod = y_var->Get<phi::DenseTensor>().lod();
 
       PADDLE_ENFORCE_EQ(y_lod.size(),
                         1,
@@ -96,13 +94,16 @@ class SequenceExpandAsOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
     AddInput("X",
-             "(LoDTensor, default LoDTensor<float>) A 2-D LoDTensor whose lod "
+             "(phi::DenseTensor, default phi::DenseTensor<float>) A 2-D "
+             "phi::DenseTensor whose lod "
              "level is at most 1.");
     AddInput("Y",
-             "(LoDTensor, default LoDTensor<float>) Referred LoDTensor whose "
+             "(phi::DenseTensor, default phi::DenseTensor<float>) Referred "
+             "phi::DenseTensor whose "
              "lod (specified level) is referred by Input(X).");
     AddOutput("Out",
-              "(LodTensor, default LoDTensor<float>) Output LoDTensor which is "
+              "(phi::DenseTensor, default phi::DenseTensor<float>) Output "
+              "phi::DenseTensor which is "
               "generated from Input(X) by referring lod of Input(Y).");
     AddComment(R"DOC(
 Sequence Expand As Operator.
@@ -116,26 +117,26 @@ Following are cases to better explain how this works:
 
 Case 1:
 
-Given a 1-level LoDTensor input(X)
+Given a 1-level phi::DenseTensor input(X)
     X.data = [[a], [b], [c], [d]]
     X.dims = [4, 1]
 and input(Y)
     Y.lod = [[0, 3, 6, 7, 8]]
 ref_level: 0
-then we get 1-level LoDTensor
+then we get 1-level phi::DenseTensor
     Out.lod =  [[0,            3,              6,  7,  8]]
     Out.data = [[a], [a], [a], [b], [b], [b], [c], [d]]
     Out.dims = [8, 1]
 
 Case 2:
 
-Given a common Tensor input(X)
+Given a common phi::DenseTensor input(X)
     X.data = [[a, b], [c, d], [e, f]]
     X.dims = [3, 2]
 and input(Y)
     Y.lod = [[0, 2, 3, 6]]
 ref_level: 0
-then we get a common LoDTensor
+then we get a common phi::DenseTensor
     Out.lod =  [[0,             2,     3,                    6]]
     Out.data = [[a, b], [a, b] [c, d], [e, f], [e, f], [e, f]]
     Out.dims = [6, 2]
diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu
index f565e0d438a0e..d5fecace6d767 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu
+++ b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu
@@ -20,8 +20,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-using LoDTensor = phi::DenseTensor;
-
 template <typename T>
 static __global__ void sequence_expand_as_kernel(const T *in_data,
                                                  const size_t *expand_offset,
@@ -69,9 +67,9 @@ template <typename T>
 struct SequenceExpandAsFunctor<phi::GPUContext, T> {
   void operator()(
       const phi::GPUContext &context,
-      const LoDTensor &x,
+      const phi::DenseTensor &x,
       const framework::Vector<size_t> &ref_lod, /*expand referenced lod*/
-      LoDTensor *out) {
+      phi::DenseTensor *out) {
     int height = x.dims()[0];
     int width = phi::product(x.dims()) / height;
 
@@ -99,9 +97,9 @@ struct SequenceExpandAsFunctor<phi::GPUContext, T> {
 template <typename T>
 struct SequenceExpandAsGradFunctor<phi::GPUContext, T> {
   void operator()(const phi::GPUContext &context,
-                  const LoDTensor &dout,
+                  const phi::DenseTensor &dout,
                   const framework::Vector<size_t> &ref_lod, /*expand based lod*/
-                  LoDTensor *dx) {
+                  phi::DenseTensor *dx) {
     int height = dx->dims()[0];
     int width = phi::product(dx->dims()) / height;
 
diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_op.h b/paddle/fluid/operators/sequence_ops/sequence_expand_op.h
index af43aec7931e7..1366fe87ab308 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_expand_op.h
+++ b/paddle/fluid/operators/sequence_ops/sequence_expand_op.h
@@ -22,7 +22,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-using LoDTensor = phi::DenseTensor;
 template <typename T,
           int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
@@ -32,30 +31,30 @@ template <typename DeviceContext, typename T>
 struct SequenceExpandFunctor {
   void operator()(
       const DeviceContext& ctx,
-      const LoDTensor& x,
+      const phi::DenseTensor& x,
       const framework::Vector<size_t>& x_lod,   /*expand source lod*/
       const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
-      LoDTensor* out);
+      phi::DenseTensor* out);
 };
 
 template <typename DeviceContext, typename T>
 struct SequenceExpandGradFunctor {
   void operator()(
       const DeviceContext& ctx,
-      const LoDTensor& dout,
+      const phi::DenseTensor& dout,
       const framework::Vector<size_t>& x_lod,   /*expand source lod*/
       const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
-      LoDTensor* dx);
+      phi::DenseTensor* dx);
 };
 
 template <typename T>
 struct SequenceExpandFunctor<phi::CPUContext, T> {
   void operator()(
       const phi::CPUContext& context,
-      const LoDTensor& x,
+      const phi::DenseTensor& x,
       const framework::Vector<size_t>& x_lod,   /*expand source lod*/
       const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
-      LoDTensor* out) {
+      phi::DenseTensor* out) {
     int out_offset = 0;
     int x_item_length = x.numel() / x.dims()[0];
     auto out_data = out->data<T>();
@@ -88,9 +87,9 @@ template <typename DeviceContext, typename T>
 class SequenceExpandKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto* x = context.Input<LoDTensor>("X");
-    auto* y = context.Input<LoDTensor>("Y");
-    auto* out = context.Output<LoDTensor>("Out");
+    auto* x = context.Input<phi::DenseTensor>("X");
+    auto* y = context.Input<phi::DenseTensor>("Y");
+    auto* out = context.Output<phi::DenseTensor>("Out");
 
     int ref_level = context.Attr<int>("ref_level");
     auto& x_lod = x->lod();
@@ -100,7 +99,7 @@ class SequenceExpandKernel : public framework::OpKernel<T> {
         y_lod.empty(),
         false,
         platform::errors::InvalidArgument(
-            "Input(Y) Tensor of SequenceExpandOp does not contain "
+            "Input(Y) phi::DenseTensor of SequenceExpandOp does not contain "
             "LoD information."));
 
     if (ref_level == -1) ref_level = y_lod.size() - 1;
@@ -164,10 +163,10 @@ template <typename T>
 struct SequenceExpandGradFunctor<phi::CPUContext, T> {
   void operator()(
       const phi::CPUContext& context,
-      const LoDTensor& dout,
+      const phi::DenseTensor& dout,
       const framework::Vector<size_t>& x_lod,   /*expand source lod*/
       const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
-      LoDTensor* dx) {
+      phi::DenseTensor* dx) {
     int dout_offset = 0;
     for (size_t i = 1; i < ref_lod.size(); ++i) {
       int repeat_num = ref_lod[i] - ref_lod[i - 1];
@@ -193,10 +192,11 @@ template <typename DeviceContext, typename T>
 class SequenceExpandGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto* g_out = context.Input<LoDTensor>(framework::GradVarName("Out"));
-    auto* x = context.Input<LoDTensor>("X");
-    auto* y = context.Input<LoDTensor>("Y");
-    auto* g_x = context.Output<LoDTensor>(framework::GradVarName("X"));
+    auto* g_out =
+        context.Input<phi::DenseTensor>(framework::GradVarName("Out"));
+    auto* x = context.Input<phi::DenseTensor>("X");
+    auto* y = context.Input<phi::DenseTensor>("Y");
+    auto* g_x = context.Output<phi::DenseTensor>(framework::GradVarName("X"));
     int ref_level = context.Attr<int>("ref_level");
 
     g_x->mutable_data<T>(context.GetPlace());
diff --git a/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc b/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc
index 6c14fa997fe5e..c380779861099 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc
+++ b/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc
@@ -82,7 +82,7 @@ class SequenceMaskOpMaker : public framework::OpProtoAndCheckerMaker {
 SequenceMask Operator
 
 This operator outputs a Mask according to Input(X) and Attr(maxlen).
-Supposing Input(X) is a Tensor with shape [d_1, d_2, ..., d_n], the
+Supposing Input(X) is a phi::DenseTensor with shape [d_1, d_2, ..., d_n], the
 Output(Y) is a mask with shape [d_1, d_2, ..., d_n, maxlen], where:
 
 Y(i_1, i_2, ..., i_n, j) = (j < X(i_1, i_2, ..., i_n))
diff --git a/paddle/fluid/operators/sequence_ops/sequence_mask_op.h b/paddle/fluid/operators/sequence_ops/sequence_mask_op.h
index 87b52174aa8e1..d541f712a5d67 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_mask_op.h
+++ b/paddle/fluid/operators/sequence_ops/sequence_mask_op.h
@@ -28,9 +28,6 @@
 namespace paddle {
 namespace operators {
 
-using LoDTensor = phi::DenseTensor;
-using Tensor = phi::DenseTensor;
-
 template <typename Tx, typename Ty>
 struct SequenceMaskForRangeFunctor {
   HOSTDEVICE SequenceMaskForRangeFunctor(const Tx *x, Ty *y, int maxlen)
@@ -50,8 +47,11 @@ struct SequenceMaskForRangeFunctor {
 
 template <typename DeviceContext, typename Tx>
 struct SequenceMaskFunctor {
-  SequenceMaskFunctor(
-      const DeviceContext &ctx, const Tx *x, Tensor *y, int limits, int maxlen)
+  SequenceMaskFunctor(const DeviceContext &ctx,
+                      const Tx *x,
+                      phi::DenseTensor *y,
+                      int limits,
+                      int maxlen)
       : ctx_(ctx), x_(x), y_(y), limits_(limits), maxlen_(maxlen) {}
 
   template <typename Ty>
@@ -64,15 +64,13 @@ struct SequenceMaskFunctor {
  private:
   const DeviceContext &ctx_;
   const Tx *x_;
-  Tensor *y_;
+  phi::DenseTensor *y_;
   int limits_;
   int maxlen_;
 };
 
 template <typename DeviceContext, typename Tx>
 class SequenceMaskKernel : public framework::OpKernel<Tx> {
-  using Tensor = phi::DenseTensor;
-
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
     auto *x = ctx.Input<phi::DenseTensor>("X");
diff --git a/paddle/fluid/operators/sequence_ops/sequence_mask_op_npu.cc b/paddle/fluid/operators/sequence_ops/sequence_mask_op_npu.cc
index 1290e79bc076d..f3b18676abe56 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_mask_op_npu.cc
+++ b/paddle/fluid/operators/sequence_ops/sequence_mask_op_npu.cc
@@ -18,8 +18,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-using Tensor = phi::DenseTensor;
-
 template <typename DeviceContext, typename T>
 class SequenceMaskNPUKernel : public framework::OpKernel<T> {
  public:
@@ -58,7 +56,7 @@ class SequenceMaskNPUKernel : public framework::OpKernel<T> {
     auto y_dim = phi::vectorize<int>(x->dims());
     y_dim.push_back(maxlen);
 
-    Tensor cast_x;
+    phi::DenseTensor cast_x;
     cast_x.mutable_data<int32_t>(x->dims(), ctx.GetPlace());
     const auto& cast1_runner = NpuOpRunner(
         "Cast",
@@ -68,7 +66,7 @@ class SequenceMaskNPUKernel : public framework::OpKernel<T> {
           ConvertToNpuDtype(framework::TransToProtoVarType(cast_x.dtype()))}});
     cast1_runner.Run(dev_ctx.stream());
 
-    Tensor tmp;
+    phi::DenseTensor tmp;
     tmp.mutable_data<int32_t>(phi::make_ddim({maxlen}), ctx.GetPlace());
     NpuOpRunner range_runner;
     range_runner.SetType("Range");
@@ -78,7 +76,7 @@ class SequenceMaskNPUKernel : public framework::OpKernel<T> {
     range_runner.AddOutput(tmp);
     range_runner.Run(dev_ctx.stream());
 
-    Tensor expand_tmp;
+    phi::DenseTensor expand_tmp;
     expand_tmp.mutable_data<int32_t>(phi::make_ddim(y_dim), ctx.GetPlace());
     const auto& expand_runner =
         NpuOpRunner("ExpandD", {tmp}, {expand_tmp}, {{"shape", y_dim}});
@@ -87,7 +85,7 @@ class SequenceMaskNPUKernel : public framework::OpKernel<T> {
     auto x_dims = phi::vectorize<int>(x->dims());
     x_dims.push_back(1);
     cast_x.Resize(phi::make_ddim({x_dims}));
-    Tensor x_tmp;
+    phi::DenseTensor x_tmp;
     x_tmp.mutable_data<int32_t>(phi::make_ddim(y_dim), ctx.GetPlace());
     const auto& tile_runner =
         NpuOpRunner("TileWithAxis",
@@ -96,7 +94,7 @@ class SequenceMaskNPUKernel : public framework::OpKernel<T> {
                     {{"axis", x->dims().size()}, {"tiles", maxlen}});
     tile_runner.Run(dev_ctx.stream());
 
-    Tensor y_tmp;
+    phi::DenseTensor y_tmp;
     y_tmp.mutable_data<uint8_t>(phi::make_ddim(y_dim), ctx.GetPlace());
     const auto& less_runner =
         NpuOpRunner("Less", {expand_tmp, x_tmp}, {y_tmp}, {});
diff --git a/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc b/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc
index d427e339fb9c3..6957920131cea 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc
+++ b/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc
@@ -69,7 +69,7 @@ class SequencePadOp : public framework::OperatorWithKernel {
       // run time
       framework::Variable* x_var =
           PADDLE_GET(framework::Variable*, ctx->GetInputVarPtrs("X")[0]);
-      const auto& x_lod = x_var->Get<LoDTensor>().lod();
+      const auto& x_lod = x_var->Get<phi::DenseTensor>().lod();
       PADDLE_ENFORCE_EQ(x_lod.empty(),
                         false,
                         platform::errors::NotFound(
@@ -145,20 +145,22 @@ class SequencePadOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
     AddInput("X",
-             "(LoDTensor, default LoDTensor<float>) Input variable which "
+             "(phi::DenseTensor, default phi::DenseTensor<float>) Input "
+             "variable which "
              "should contain lod information.");
     AddInput("PadValue",
-             "(LoDTensor), this Tensor holds values that will be fill into "
+             "(phi::DenseTensor), this phi::DenseTensor holds values that will "
+             "be filled into "
              "padded steps. It can be a scalar or a tensor whose shape equals "
              "to time steps in sequences. If it's a scalar, it will be "
              "automatically broadcasted to the shape of time step.");
-    AddOutput(
-        "Out",
-        "(LoDTensor) The output vairable, which contains padded sequences.");
-    AddOutput(
-        "Length",
-        "(LoDTensor) The output vairable, which contains the actual length of "
-        "sequences before padding.");
+    AddOutput("Out",
+              "(phi::DenseTensor) The output variable, which contains padded "
+              "sequences.");
+    AddOutput("Length",
+              "(phi::DenseTensor) The output variable, which contains the "
+              "actual length of "
+              "sequences before padding.");
     AddAttr<int>(
         "padded_length",
         "The length of padded sequences. It can be set to -1 or "
@@ -179,41 +181,41 @@ class SequencePadOpMaker : public framework::OpProtoAndCheckerMaker {
 
       Case 1:
 
-      Given a 1-level LoDTensor input(X):
+      Given a 1-level phi::DenseTensor input(X):
           X.lod = [[0, 2,       5]]
           X.data = [a, b, c, d, e]
       and Input(PadValue):
           PadValue.data = [0]
       and attribite 'padded_length' = 4,
-      then we get LoDTensor:
+      then we get phi::DenseTensor:
           Out.data = [[a, b, 0, 0],
                       [c, d, e, 0]]
           Length.data = [2, 3]
 
       Case 2:
 
-      Given a 1-level LoDTensor input(X):
+      Given a 1-level phi::DenseTensor input(X):
           X.lod = [[0,               2,                           5]]
           X.data = [[a1, a2], [b1, b2], [c1, c2], [d1, d2], [e1, e2]]
       and Input(PadValue):
           PadValue.data = [0]
       and attribite 'padded_length' = -1, which mean using the length
       of longest input sequence(3 in this case),
-      then we get LoDTensor:
+      then we get phi::DenseTensor:
           Out.data = [[[a1, a2], [b1, b2], [0, 0]],
                       [[c1, c2], [d1, d2], [e1, e2]]]
           Length.data = [2, 3]
 
       Case 3:
 
-      Given a 1-level LoDTensor input(X):
+      Given a 1-level phi::DenseTensor input(X):
           X.lod = [[0,               2,                           5]]
           X.data = [[a1, a2], [b1, b2], [c1, c2], [d1, d2], [e1, e2]]
       and Input(PadValue):
           PadValue.data = [p1, p2]
       and attribite 'padded_length' = -1, which mean using the length
       of longest input sequence(3 in this case),
-      then we get LoDTensor:
+      then we get phi::DenseTensor:
           Out.data = [[[a1, a2], [b1, b2], [p1, p2]],
                       [[c1, c2], [d1, d2], [e1, e2]]]
           Length.data = [2, 3]
diff --git a/paddle/fluid/operators/sequence_ops/sequence_pad_op.h b/paddle/fluid/operators/sequence_ops/sequence_pad_op.h
index 6f9026095756a..0615e0c943e25 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_pad_op.h
+++ b/paddle/fluid/operators/sequence_ops/sequence_pad_op.h
@@ -24,25 +24,24 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-using LoDTensor = phi::DenseTensor;
 using LoD = framework::LoD;
 
 template <typename DeviceContext, typename T>
 class SequencePadOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    const auto* x = ctx.Input<LoDTensor>("X");
-    auto* out = ctx.Output<LoDTensor>("Out");
-    auto* len_t = ctx.Output<LoDTensor>("Length");
+    const auto* x = ctx.Input<phi::DenseTensor>("X");
+    auto* out = ctx.Output<phi::DenseTensor>("Out");
+    auto* len_t = ctx.Output<phi::DenseTensor>("Length");
     out->mutable_data<T>(ctx.GetPlace());
 
-    PADDLE_ENFORCE_EQ(
-        x->lod().empty(),
-        false,
-        platform::errors::NotFound("Input(X) Tensor of SequencePadOp does not "
-                                   "contain LoD information."));
+    PADDLE_ENFORCE_EQ(x->lod().empty(),
+                      false,
+                      platform::errors::NotFound(
+                          "Input(X) phi::DenseTensor of SequencePadOp does not "
+                          "contain LoD information."));
 
-    const auto* pad_value = ctx.Input<LoDTensor>("PadValue");
+    const auto* pad_value = ctx.Input<phi::DenseTensor>("PadValue");
 
     int padded_length = ctx.Attr<int>("padded_length");
 
@@ -56,7 +55,7 @@ class SequencePadOpKernel : public framework::OpKernel<T> {
         false,
         math::kBatchLengthWidth);
 
-    LoDTensor seq_len;
+    phi::DenseTensor seq_len;
     seq_len.Resize(len_t->dims());
     int64_t* len_data = seq_len.mutable_data<int64_t>(platform::CPUPlace());
     for (size_t i = 1; i < x->lod()[0].size(); ++i) {
@@ -73,9 +72,10 @@ template <typename DeviceContext, typename T>
 class SequencePadGradOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* d_x = ctx.Output<LoDTensor>(framework::GradVarName("X"));
+    auto* d_x = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
     if (d_x) {
-      const auto* d_out = ctx.Input<LoDTensor>(framework::GradVarName("Out"));
+      const auto* d_out =
+          ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
       d_x->mutable_data<T>(ctx.GetPlace());
 
       int padded_length = ctx.Attr<int>("padded_length");
diff --git a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc b/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc
index 9b8697b976633..778b2f8854945 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc
+++ b/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc
@@ -53,12 +53,15 @@ class SequencePoolOp : public framework::OperatorWithKernel {
 class SequencePoolOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
-    AddInput("X", "(LoDTensor) The variable-length input of SequencePoolOp");
-    AddOutput("Out",
-              "(Tensor) The output of SequencePoolOp does not contain LoD "
-              "information.");
+    AddInput("X",
+             "(phi::DenseTensor) The variable-length input of SequencePoolOp");
+    AddOutput(
+        "Out",
+        "(phi::DenseTensor) The output of SequencePoolOp does not contain LoD "
+        "information.");
     AddOutput("MaxIndex",
-              "(Tensor<int>) This tensor is used for the sequence max-pooling "
+              "(phi::DenseTensor<int>) This tensor is used for the sequence "
+              "max-pooling "
               "to record the max indexes.")
         .AsIntermediate();
     AddAttr<bool>("is_test",
@@ -92,11 +95,11 @@ The following example explains how this works:
 For a mini-batch of 3 variable-length sentences,
 containing 2, 3, and 2 time-steps:
 
-Assume X is a [7,M,N] LoDTensor, and X->lod()[0] = [0, 2, 5, 7], 7=2+3+2.
+Assume X is a [7,M,N] phi::DenseTensor, and X->lod()[0] = [0, 2, 5, 7], 7=2+3+2.
 Besides, for the sake of simplicity, we assume M=1 and N=1,
 and the value of X = [[1, 3], [2, 4, 6], [5, 1]].
 
-Thus, Out is a [3,1,1] Tensor without LoD information.
+Thus, Out is a [3,1,1] phi::DenseTensor without LoD information.
 And for different pooltype, the value of Out is as follows:
 
 - AVERAGE: [2, 4, 3], where 2=(1+3)/2, 4=(2+4+6)/3, 3=(5+1)/2
diff --git a/paddle/fluid/operators/sequence_ops/sequence_pool_op.h b/paddle/fluid/operators/sequence_ops/sequence_pool_op.h
index 199187a1544c7..78acb4eef28a7 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_pool_op.h
+++ b/paddle/fluid/operators/sequence_ops/sequence_pool_op.h
@@ -23,15 +23,12 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-using Tensor = phi::DenseTensor;
-using LoDTensor = phi::DenseTensor;
-
 template <typename DeviceContext, typename T>
 class SequencePoolKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto* in = context.Input<LoDTensor>("X");
-    auto* out = context.Output<LoDTensor>("Out");
+    auto* in = context.Input<phi::DenseTensor>("X");
+    auto* out = context.Output<phi::DenseTensor>("Out");
     std::string pooltype = context.Attr<std::string>("pooltype");
     T pad_value = static_cast<T>(context.Attr<float>("pad_value"));
 
@@ -39,11 +36,11 @@ class SequencePoolKernel : public framework::OpKernel<T> {
     auto lod = in->lod();
     auto lod_level = lod.size();
     // InferShape by lod
-    PADDLE_ENFORCE_GT(
-        lod_level,
-        0,
-        platform::errors::InvalidArgument("Input(X) Tensor of SequencePoolOp "
-                                          "does not contain LoD information."));
+    PADDLE_ENFORCE_GT(lod_level,
+                      0,
+                      platform::errors::InvalidArgument(
+                          "Input(X) phi::DenseTensor of SequencePoolOp "
+                          "does not contain LoD information."));
     PADDLE_ENFORCE_LE(lod_level,
                       2UL,
                       platform::errors::InvalidArgument(
@@ -100,8 +97,9 @@ template <typename DeviceContext, typename T>
 class SequencePoolGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto* out_g = context.Input<LoDTensor>(framework::GradVarName("Out"));
-    auto* in_g = context.Output<LoDTensor>(framework::GradVarName("X"));
+    auto* out_g =
+        context.Input<phi::DenseTensor>(framework::GradVarName("Out"));
+    auto* in_g = context.Output<phi::DenseTensor>(framework::GradVarName("X"));
     std::string pooltype = context.Attr<std::string>("pooltype");
     const phi::DenseTensor* index = nullptr;
     if (pooltype == "MAX") {
diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.h b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.h
index aeae0a0e1fdd5..03036a0babf3b 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.h
+++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.h
@@ -19,33 +19,30 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-using Tensor = phi::DenseTensor;
-using LoDTensor = phi::DenseTensor;
-
 template <typename DeviceContext, typename T>
 struct SequenceSoftmaxFunctor {
   void operator()(
       const DeviceContext &ctx,
-      const LoDTensor &x,
+      const phi::DenseTensor &x,
       const framework::Vector<size_t> &ref_lod, /*expand referenced lod*/
-      LoDTensor *out);
+      phi::DenseTensor *out);
 };
 
 template <typename DeviceContext, typename T>
 struct SequenceSoftmaxGradFunctor {
   void operator()(const DeviceContext &ctx,
-                  const LoDTensor &dout,
-                  const LoDTensor &out,
+                  const phi::DenseTensor &dout,
+                  const phi::DenseTensor &out,
                   const framework::Vector<size_t> &ref_lod, /*referenced lod*/
-                  LoDTensor *dx);
+                  phi::DenseTensor *dx);
 };
 
 template <typename T>
 struct SequenceSoftmaxFunctor<phi::CPUContext, T> {
   void operator()(const phi::CPUContext &ctx,
-                  const LoDTensor &x,
+                  const phi::DenseTensor &x,
                   const framework::Vector<size_t> &ref_lod, /*referenced lod*/
-                  LoDTensor *out) {
+                  phi::DenseTensor *out) {
     size_t height = ref_lod.size() - 1;
     const T *in_data = x.data<T>();
     T *out_data = out->mutable_data<T>(ctx.GetPlace());
@@ -65,10 +62,10 @@ struct SequenceSoftmaxFunctor<phi::CPUContext, T> {
 template <typename T>
 struct SequenceSoftmaxGradFunctor<phi::CPUContext, T> {
   void operator()(const phi::CPUContext &ctx,
-                  const LoDTensor &dout,
-                  const LoDTensor &out,
+                  const phi::DenseTensor &dout,
+                  const phi::DenseTensor &out,
                   const framework::Vector<size_t> &ref_lod, /*referenced lod*/
-                  LoDTensor *dx) {
+                  phi::DenseTensor *dx) {
     size_t height = ref_lod.size() - 1;
 
     const T *softmax_grad_data = dout.data<T>();
@@ -94,17 +91,17 @@ template <typename DeviceContext, typename T>
 class SequenceSoftmaxKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
-    auto *x = ctx.Input<LoDTensor>("X");
-    auto *out = ctx.Output<LoDTensor>("Out");
+    auto *x = ctx.Input<phi::DenseTensor>("X");
+    auto *out = ctx.Output<phi::DenseTensor>("Out");
 
     auto lod = x->lod();
     auto dims = x->dims();
-    PADDLE_ENFORCE_EQ(
-        lod.empty(),
-        false,
-        platform::errors::InvalidArgument(
-            "Input(X) Tensor of SequenceSoftmax operator does not contain "
-            "LoD information."));
+    PADDLE_ENFORCE_EQ(lod.empty(),
+                      false,
+                      platform::errors::InvalidArgument(
+                          "Input(X) phi::DenseTensor of SequenceSoftmax "
+                          "operator does not contain "
+                          "LoD information."));
 
     const size_t level = lod.size() - 1;
     PADDLE_ENFORCE_EQ(
@@ -138,10 +135,10 @@ template <typename DeviceContext, typename T>
 class SequenceSoftmaxGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
-    auto *out = ctx.Input<LoDTensor>("Out");
-    auto *out_grad = ctx.Input<LoDTensor>(framework::GradVarName("Out"));
-    auto *x = ctx.Input<LoDTensor>("X");
-    auto *x_grad = ctx.Output<LoDTensor>(framework::GradVarName("X"));
+    auto *out = ctx.Input<phi::DenseTensor>("Out");
+    auto *out_grad = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
+    auto *x = ctx.Input<phi::DenseTensor>("X");
+    auto *x_grad = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
     if (!x_grad) {
       return;
     }

From d8c9f19ac44ddf2299e975bdf5732e63bc6b39ec Mon Sep 17 00:00:00 2001
From: cyber-pioneer <116002591+cyber-pioneer@users.noreply.github.com>
Date: Mon, 5 Dec 2022 11:40:41 +0800
Subject: [PATCH 02/13] move paddle.fluid.layers.tensor.create_tensor to
 paddle.tensor.creation.create_tensor (#48662)

---
 python/paddle/distribution/distribution.py    |  2 +-
 python/paddle/fluid/layers/tensor.py          | 43 ------------------
 python/paddle/fluid/tests/test_if_else_op.py  |  4 +-
 .../fleet/dist_mnist_gradient_merge.py        |  2 +-
 .../collective/fleet/pipeline_mnist.py        |  2 +-
 .../fleet/pipeline_mnist_multi_device.py      |  2 +-
 .../fleet/pipeline_mnist_one_device.py        |  2 +-
 .../tests/unittests/dist_allreduce_op.py      |  2 +-
 .../dist_fleet_raw_program_optimizer.py       |  2 +-
 ...et_raw_program_optimizer_fuse_allreduce.py |  2 +-
 .../fluid/tests/unittests/dist_mnist.py       |  2 +-
 .../tests/unittests/dist_mnist_batch_merge.py |  2 +-
 .../unittests/dist_mnist_fp16_allreduce.py    |  2 +-
 .../fluid/tests/unittests/dist_mnist_lars.py  |  2 +-
 .../test_distribution_categorical.py          |  3 +-
 .../unittests/npu/test_assign_value_op_npu.py |  2 +-
 .../tests/unittests/test_assign_value_op.py   |  2 +-
 .../tests/unittests/test_conditional_block.py |  2 +-
 .../fluid/tests/unittests/test_desc_clone.py  |  2 +-
 .../fluid/tests/unittests/test_fetch_var.py   |  5 ++-
 .../fluid/tests/unittests/test_profiler.py    |  2 +-
 .../tests/unittests/test_square_error_cost.py |  5 +--
 .../unittests/xpu/test_assign_value_op_xpu.py |  2 +-
 python/paddle/tensor/__init__.py              |  4 ++
 python/paddle/tensor/creation.py              | 44 ++++++++++++++++++-
 25 files changed, 74 insertions(+), 70 deletions(-)

diff --git a/python/paddle/distribution/distribution.py b/python/paddle/distribution/distribution.py
index 15ee5d8e011e4..ae4cb2f9b16ef 100644
--- a/python/paddle/distribution/distribution.py
+++ b/python/paddle/distribution/distribution.py
@@ -203,7 +203,7 @@ def _to_tensor(self, *args):
         dtype = tmp.dtype
         for arg in numpy_args:
             arg_broadcasted, _ = np.broadcast_arrays(arg, tmp)
-            arg_variable = tensor.create_tensor(dtype=dtype)
+            arg_variable = paddle.tensor.create_tensor(dtype=dtype)
             tensor.assign(arg_broadcasted, arg_variable)
             variable_args.append(arg_variable)
 
diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index e131744cd8685..6a88b6828fb85 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -47,7 +47,6 @@
 from paddle import _C_ops, _legacy_C_ops
 
 __all__ = [
-    'create_tensor',
     'create_global_var',
     'cast',
     'tensor_array_to_tensor',
@@ -62,48 +61,6 @@
 ]
 
 
-def create_tensor(dtype, name=None, persistable=False):
-    """
-    Create a variable, which will hold a Tensor with data type dtype.
-
-    Args:
-        dtype(string|numpy.dtype): the data type of Tensor to be created, the
-            data type is bool, float16, float32, float64, int8, int16, int32 and int64.
-        name(string, optional): The default value is None.  Normally there is no need for
-            user to set this property.  For more information, please refer to :ref:`api_guide_Name`
-        persistable(bool): Set the persistable flag of the create tensor.
-            default value is False.
-
-    Returns:
-        Variable: The tensor to be created according to dtype.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          tensor = fluid.layers.create_tensor(dtype='float32')
-    """
-    check_dtype(
-        dtype,
-        'dtype',
-        [
-            'bool',
-            'float16',
-            'float32',
-            'float64',
-            'int8',
-            'int32',
-            'int32',
-            'int64',
-        ],
-        'create_tensor',
-    )
-    helper = LayerHelper("create_tensor", **locals())
-    return helper.create_variable(
-        name=helper.name, dtype=dtype, persistable=persistable
-    )
-
-
 def create_global_var(
     shape, value, dtype, persistable=False, force_cpu=False, name=None
 ):
diff --git a/python/paddle/fluid/tests/test_if_else_op.py b/python/paddle/fluid/tests/test_if_else_op.py
index 1eba6cbb60ee1..24857164dc30b 100644
--- a/python/paddle/fluid/tests/test_if_else_op.py
+++ b/python/paddle/fluid/tests/test_if_else_op.py
@@ -46,7 +46,7 @@ def not_test_raw_api(self):
             cond = paddle.less_than(x=label, y=limit)
             true_image, false_image = split_lod_tensor(input=image, mask=cond)
 
-            true_out = layers.create_tensor(dtype='float32')
+            true_out = paddle.tensor.create_tensor(dtype='float32')
             true_cond = ConditionalBlock([cond])
 
             with true_cond.block():
@@ -54,7 +54,7 @@ def not_test_raw_api(self):
                 prob = layers.fc(input=hidden, size=10, act='softmax')
                 layers.assign(input=prob, output=true_out)
 
-            false_out = layers.create_tensor(dtype='float32')
+            false_out = paddle.tensor.create_tensor(dtype='float32')
             false_cond = ConditionalBlock([cond])
 
             with false_cond.block():
diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/dist_mnist_gradient_merge.py b/python/paddle/fluid/tests/unittests/collective/fleet/dist_mnist_gradient_merge.py
index 1e67d722040c4..85394ea89da4e 100644
--- a/python/paddle/fluid/tests/unittests/collective/fleet/dist_mnist_gradient_merge.py
+++ b/python/paddle/fluid/tests/unittests/collective/fleet/dist_mnist_gradient_merge.py
@@ -38,7 +38,7 @@ def get_model(self, batch_size=2):
         avg_cost = paddle.mean(x=cost)
 
         # Evaluator
-        batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+        batch_size_tensor = paddle.tensor.create_tensor(dtype='int64')
         batch_acc = paddle.static.accuracy(
             input=predict, label=label, total=batch_size_tensor
         )
diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist.py b/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist.py
index 59572a5e7deac..4530d8e24216a 100644
--- a/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist.py
+++ b/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist.py
@@ -105,7 +105,7 @@ def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None):
 
         # Evaluator
         with fluid.device_guard("gpu:1"):
-            batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+            batch_size_tensor = paddle.tensor.create_tensor(dtype='int64')
             batch_acc = paddle.static.accuracy(
                 input=predict, label=label, total=batch_size_tensor
             )
diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist_multi_device.py b/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist_multi_device.py
index e60b6bbbcd428..0b75b034ce46e 100644
--- a/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist_multi_device.py
+++ b/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist_multi_device.py
@@ -105,7 +105,7 @@ def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None):
 
         # Evaluator
         with fluid.device_guard("gpu:1"):
-            batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+            batch_size_tensor = paddle.tensor.create_tensor(dtype='int64')
             batch_acc = paddle.static.accuracy(
                 input=predict, label=label, total=batch_size_tensor
             )
diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist_one_device.py b/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist_one_device.py
index 8ecea66aaa6bc..5b1e590fc0058 100644
--- a/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist_one_device.py
+++ b/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist_one_device.py
@@ -97,7 +97,7 @@ def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None):
 
         # Evaluator
         with fluid.device_guard("gpu:0"):
-            batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+            batch_size_tensor = paddle.tensor.create_tensor(dtype='int64')
             batch_acc = paddle.static.accuracy(
                 input=predict, label=label, total=batch_size_tensor
             )
diff --git a/python/paddle/fluid/tests/unittests/dist_allreduce_op.py b/python/paddle/fluid/tests/unittests/dist_allreduce_op.py
index bd3a6d659a39e..c956f287d7b14 100644
--- a/python/paddle/fluid/tests/unittests/dist_allreduce_op.py
+++ b/python/paddle/fluid/tests/unittests/dist_allreduce_op.py
@@ -81,7 +81,7 @@ def get_model(self, batch_size=2, single_device=False):
         avg_cost = paddle.mean(x=cost)
 
         # Evaluator
-        batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+        batch_size_tensor = paddle.tensor.create_tensor(dtype='int64')
         batch_acc = paddle.static.accuracy(
             input=predict, label=label, total=batch_size_tensor
         )
diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_raw_program_optimizer.py b/python/paddle/fluid/tests/unittests/dist_fleet_raw_program_optimizer.py
index 90c1ea16a82c5..7c98169433b0b 100644
--- a/python/paddle/fluid/tests/unittests/dist_fleet_raw_program_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/dist_fleet_raw_program_optimizer.py
@@ -83,7 +83,7 @@ def get_model(self, batch_size=2, single_device=False):
         avg_cost = paddle.mean(x=cost)
 
         # Evaluator
-        batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+        batch_size_tensor = paddle.tensor.create_tensor(dtype='int64')
         batch_acc = paddle.static.accuracy(
             input=predict, label=label, total=batch_size_tensor
         )
diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_raw_program_optimizer_fuse_allreduce.py b/python/paddle/fluid/tests/unittests/dist_fleet_raw_program_optimizer_fuse_allreduce.py
index 98d7ef1d1569d..e46173735a967 100644
--- a/python/paddle/fluid/tests/unittests/dist_fleet_raw_program_optimizer_fuse_allreduce.py
+++ b/python/paddle/fluid/tests/unittests/dist_fleet_raw_program_optimizer_fuse_allreduce.py
@@ -83,7 +83,7 @@ def get_model(self, batch_size=2, single_device=False):
         avg_cost = paddle.mean(x=cost)
 
         # Evaluator
-        batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+        batch_size_tensor = paddle.tensor.create_tensor(dtype='int64')
         batch_acc = paddle.static.accuracy(
             input=predict, label=label, total=batch_size_tensor
         )
diff --git a/python/paddle/fluid/tests/unittests/dist_mnist.py b/python/paddle/fluid/tests/unittests/dist_mnist.py
index 3cecc8b32c0b0..819b959a1fa8d 100644
--- a/python/paddle/fluid/tests/unittests/dist_mnist.py
+++ b/python/paddle/fluid/tests/unittests/dist_mnist.py
@@ -82,7 +82,7 @@ def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None):
         avg_cost = paddle.mean(x=cost)
 
         # Evaluator
-        batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+        batch_size_tensor = paddle.tensor.create_tensor(dtype='int64')
         batch_acc = paddle.static.accuracy(
             input=predict, label=label, total=batch_size_tensor
         )
diff --git a/python/paddle/fluid/tests/unittests/dist_mnist_batch_merge.py b/python/paddle/fluid/tests/unittests/dist_mnist_batch_merge.py
index 4cda9dd53a723..aa963ab012bc0 100644
--- a/python/paddle/fluid/tests/unittests/dist_mnist_batch_merge.py
+++ b/python/paddle/fluid/tests/unittests/dist_mnist_batch_merge.py
@@ -47,7 +47,7 @@ def get_model(self, batch_size=2):
         avg_cost = paddle.mean(x=cost)
 
         # Evaluator
-        batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+        batch_size_tensor = paddle.tensor.create_tensor(dtype='int64')
         batch_acc = paddle.static.accuracy(
             input=predict, label=label, total=batch_size_tensor
         )
diff --git a/python/paddle/fluid/tests/unittests/dist_mnist_fp16_allreduce.py b/python/paddle/fluid/tests/unittests/dist_mnist_fp16_allreduce.py
index 53819ca5491d4..ad0b25e8ea15a 100644
--- a/python/paddle/fluid/tests/unittests/dist_mnist_fp16_allreduce.py
+++ b/python/paddle/fluid/tests/unittests/dist_mnist_fp16_allreduce.py
@@ -41,7 +41,7 @@ def get_model(self, batch_size=2):
         avg_cost = paddle.mean(x=cost)
 
         # Evaluator
-        batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+        batch_size_tensor = paddle.tensor.create_tensor(dtype='int64')
         batch_acc = paddle.static.accuracy(
             input=predict, label=label, total=batch_size_tensor
         )
diff --git a/python/paddle/fluid/tests/unittests/dist_mnist_lars.py b/python/paddle/fluid/tests/unittests/dist_mnist_lars.py
index 347692afdd0b1..b886ad8953461 100644
--- a/python/paddle/fluid/tests/unittests/dist_mnist_lars.py
+++ b/python/paddle/fluid/tests/unittests/dist_mnist_lars.py
@@ -38,7 +38,7 @@ def get_model(self, batch_size=2):
         avg_cost = paddle.mean(x=cost)
 
         # Evaluator
-        batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+        batch_size_tensor = paddle.tensor.create_tensor(dtype='int64')
         batch_acc = paddle.static.accuracy(
             input=predict, label=label, total=batch_size_tensor
         )
diff --git a/python/paddle/fluid/tests/unittests/distribution/test_distribution_categorical.py b/python/paddle/fluid/tests/unittests/distribution/test_distribution_categorical.py
index 5dfcedcc0c5a6..91e5b22576743 100644
--- a/python/paddle/fluid/tests/unittests/distribution/test_distribution_categorical.py
+++ b/python/paddle/fluid/tests/unittests/distribution/test_distribution_categorical.py
@@ -20,7 +20,6 @@
 import paddle
 from paddle import fluid
 from paddle.distribution import Categorical, Distribution, Normal, Uniform
-from paddle.fluid import layers
 
 np.random.seed(2022)
 
@@ -380,7 +379,7 @@ def test_distribution_error(self):
         )
 
         value_npdata = np.array([0.8], dtype="float32")
-        value_tensor = layers.create_tensor(dtype="float32")
+        value_tensor = paddle.tensor.create_tensor(dtype="float32")
         self.assertRaises(
             NotImplementedError, distribution.log_prob, value_tensor
         )
diff --git a/python/paddle/fluid/tests/unittests/npu/test_assign_value_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_assign_value_op_npu.py
index 402b90bc49bbd..1df24e54a16b3 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_assign_value_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_assign_value_op_npu.py
@@ -93,7 +93,7 @@ def init_dtype(self):
     def test_assign(self):
         main_program = fluid.Program()
         with fluid.program_guard(main_program):
-            x = layers.create_tensor(dtype=self.dtype)
+            x = paddle.tensor.create_tensor(dtype=self.dtype)
             layers.assign(input=self.value, output=x)
 
         exe = fluid.Executor(self.place)
diff --git a/python/paddle/fluid/tests/unittests/test_assign_value_op.py b/python/paddle/fluid/tests/unittests/test_assign_value_op.py
index 7a5128ed2ff15..c0a5554d39b97 100644
--- a/python/paddle/fluid/tests/unittests/test_assign_value_op.py
+++ b/python/paddle/fluid/tests/unittests/test_assign_value_op.py
@@ -83,7 +83,7 @@ def init_dtype(self):
     def test_assign(self):
         main_program = fluid.Program()
         with fluid.program_guard(main_program):
-            x = layers.create_tensor(dtype=self.dtype)
+            x = paddle.tensor.create_tensor(dtype=self.dtype)
             layers.assign(input=self.value, output=x)
 
         exe = fluid.Executor(self.place)
diff --git a/python/paddle/fluid/tests/unittests/test_conditional_block.py b/python/paddle/fluid/tests/unittests/test_conditional_block.py
index 418ae3875998e..1eaf25dc34877 100644
--- a/python/paddle/fluid/tests/unittests/test_conditional_block.py
+++ b/python/paddle/fluid/tests/unittests/test_conditional_block.py
@@ -33,7 +33,7 @@ def test_forward(self):
             data = layers.data(name='X', shape=[1], dtype='float32')
             data.stop_gradient = False
             cond = ConditionalBlock(inputs=[data])
-            out = layers.create_tensor(dtype='float32')
+            out = paddle.tensor.create_tensor(dtype='float32')
             with cond.block():
                 hidden = layers.fc(input=data, size=10)
                 layers.assign(hidden, out)
diff --git a/python/paddle/fluid/tests/unittests/test_desc_clone.py b/python/paddle/fluid/tests/unittests/test_desc_clone.py
index 7d4f1f0975fc4..477910f53d59d 100644
--- a/python/paddle/fluid/tests/unittests/test_desc_clone.py
+++ b/python/paddle/fluid/tests/unittests/test_desc_clone.py
@@ -77,7 +77,7 @@ def get_model(batch_size):
     avg_cost = paddle.mean(x=cost)
 
     # Evaluator
-    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+    batch_size_tensor = paddle.tensor.create_tensor(dtype='int64')
     batch_acc = paddle.static.accuracy(
         input=predict, label=label, total=batch_size_tensor
     )
diff --git a/python/paddle/fluid/tests/unittests/test_fetch_var.py b/python/paddle/fluid/tests/unittests/test_fetch_var.py
index 4339813584a90..3303e30a4f3ad 100644
--- a/python/paddle/fluid/tests/unittests/test_fetch_var.py
+++ b/python/paddle/fluid/tests/unittests/test_fetch_var.py
@@ -16,6 +16,7 @@
 
 import numpy as np
 
+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.layers as layers
 
@@ -26,7 +27,9 @@ def set_input(self):
 
     def test_fetch_var(self):
         self.set_input()
-        x = layers.create_tensor(dtype="int32", persistable=True, name="x")
+        x = paddle.tensor.create_tensor(
+            dtype="int32", persistable=True, name="x"
+        )
         layers.assign(input=self.val, output=x)
         exe = fluid.Executor(fluid.CPUPlace())
         exe.run(fluid.default_main_program(), feed={}, fetch_list=[])
diff --git a/python/paddle/fluid/tests/unittests/test_profiler.py b/python/paddle/fluid/tests/unittests/test_profiler.py
index 6b414afbe4a7f..62d46d4cadc48 100644
--- a/python/paddle/fluid/tests/unittests/test_profiler.py
+++ b/python/paddle/fluid/tests/unittests/test_profiler.py
@@ -59,7 +59,7 @@ def build_program(self, compile_program=True):
             label = fluid.layers.data(name='y', shape=[1], dtype='int64')
             cost = fluid.layers.cross_entropy(input=predict, label=label)
             avg_cost = paddle.mean(cost)
-            batch_size = fluid.layers.create_tensor(dtype='int64')
+            batch_size = paddle.tensor.create_tensor(dtype='int64')
             batch_acc = paddle.static.accuracy(
                 input=predict, label=label, total=batch_size
             )
diff --git a/python/paddle/fluid/tests/unittests/test_square_error_cost.py b/python/paddle/fluid/tests/unittests/test_square_error_cost.py
index 7828f01b02fe6..afd16a3095738 100644
--- a/python/paddle/fluid/tests/unittests/test_square_error_cost.py
+++ b/python/paddle/fluid/tests/unittests/test_square_error_cost.py
@@ -19,7 +19,6 @@
 import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
-import paddle.fluid.layers as layers
 from paddle.fluid.executor import Executor
 
 
@@ -31,8 +30,8 @@ def test_square_error_cost(self):
         sub = input_val - label_val
         np_result = sub * sub
 
-        input_var = layers.create_tensor(dtype="float32", name="input")
-        label_var = layers.create_tensor(dtype="float32", name="label")
+        input_var = paddle.tensor.create_tensor(dtype="float32", name="input")
+        label_var = paddle.tensor.create_tensor(dtype="float32", name="label")
         output = paddle.nn.functional.square_error_cost(
             input=input_var, label=label_var
         )
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_assign_value_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_assign_value_op_xpu.py
index 7de6af1b45c3c..560815cb56bee 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_assign_value_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_assign_value_op_xpu.py
@@ -94,7 +94,7 @@ def init_dtype(self):
     def test_assign(self):
         main_program = fluid.Program()
         with fluid.program_guard(main_program):
-            x = layers.create_tensor(dtype=self.dtype)
+            x = paddle.tensor.create_tensor(dtype=self.dtype)
             layers.assign(input=self.value, output=x)
 
         exe = fluid.Executor(self.place)
diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py
index 4c1ec07838050..89df1b6ac3b47 100755
--- a/python/paddle/tensor/__init__.py
+++ b/python/paddle/tensor/__init__.py
@@ -19,6 +19,8 @@
 from .attribute import real  # noqa: F401
 from .attribute import imag  # noqa: F401
 from .attribute import is_floating_point  # noqa: F401
+from .creation import create_parameter  # noqa: F401
+from .creation import create_tensor  # noqa: F401
 from .creation import to_tensor  # noqa: F401
 from .creation import diag  # noqa: F401
 from .creation import diagflat  # noqa: F401
@@ -289,6 +291,8 @@
 
 # this list used in math_op_patch.py for _binary_creator_
 tensor_method_func = [  # noqa
+    'create_parameter',
+    'create_tensor',
     'matmul',
     'dot',
     'cov',
diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py
index c969ee3639bf9..134e27eef9df6 100644
--- a/python/paddle/tensor/creation.py
+++ b/python/paddle/tensor/creation.py
@@ -100,7 +100,7 @@ def create_parameter(
 
             import paddle
             paddle.enable_static()
-            W = paddle.static.create_parameter(shape=[784, 200], dtype='float32')
+            W = paddle.create_parameter(shape=[784, 200], dtype='float32')
     """
     check_type(shape, 'shape', (list, tuple, np.ndarray), 'create_parameter')
     for item in shape:
@@ -150,6 +150,48 @@ def create_parameter(
     )
 
 
+def create_tensor(dtype, name=None, persistable=False):
+    """
+    Create a variable, which will hold a Tensor with data type dtype.
+
+    Args:
+        dtype(string|numpy.dtype): the data type of the Tensor to be created. Supported
+            data types are bool, float16, float32, float64, int8, int16, int32 and int64.
+        name(string, optional): The default value is None.  Normally there is no need for
+            user to set this property.  For more information, please refer to :ref:`api_guide_Name`
+        persistable(bool, optional): Set the persistable flag of the created tensor.
+            The default value is False.
+
+    Returns:
+        Variable: The tensor created according to dtype.
+
+    Examples:
+        .. code-block:: python
+
+          import paddle
+          tensor = paddle.tensor.create_tensor(dtype='float32')
+    """
+    check_dtype(
+        dtype,
+        'dtype',
+        [
+            'bool',
+            'float16',
+            'float32',
+            'float64',
+            'int8',
+            'int16',
+            'int32',
+            'int64',
+        ],
+        'create_tensor',
+    )
+    helper = LayerHelper("create_tensor", **locals())
+    return helper.create_variable(
+        name=helper.name, dtype=dtype, persistable=persistable
+    )
+
+
 def linspace(start, stop, num, dtype=None, name=None):
     r"""
     Return fixed number of evenly spaced values within a given interval.

From 0ebace143854c94dece090fd83baeb9d45757c03 Mon Sep 17 00:00:00 2001
From: Matsumoto Ruko <38883252+gsq7474741@users.noreply.github.com>
Date: Mon, 5 Dec 2022 11:54:11 +0800
Subject: [PATCH 03/13] remove deprecated warnings for py36 (#48639)

---
 python/paddle/utils/deprecated.py | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/python/paddle/utils/deprecated.py b/python/paddle/utils/deprecated.py
index 7854f12aa9c10..cd9e6947b77b3 100755
--- a/python/paddle/utils/deprecated.py
+++ b/python/paddle/utils/deprecated.py
@@ -23,14 +23,6 @@
 
 __all__ = []
 
-# NOTE(zhiqiu): Since python 3.2, DeprecationWarning is ignored by default,
-# and since python 3.7, it is once again shown by default when triggered directly by code in __main__.
-# See details: https://docs.python.org/3/library/warnings.html#default-warning-filter
-# The following line set DeprecationWarning to show once, which is expected to work in python 3.2 -> 3.6
-# However, doing this could introduce one samll side effect, i.e., the DeprecationWarning which is not issued by @deprecated.
-# The side effect is acceptable, and we will find better way to do this if we could.
-warnings.simplefilter('default', DeprecationWarning)
-
 
 def deprecated(update_to="", since="", reason="", level=0):
     """Decorate a function to signify its deprecation.

From 89f024e35167a39f72dc956b170aa9a291595c3f Mon Sep 17 00:00:00 2001
From: 201716010711 <87008376+201716010711@users.noreply.github.com>
Date: Mon, 5 Dec 2022 12:56:12 +0800
Subject: [PATCH 04/13] delete shape api (#48546)

---
 python/paddle/distribution/normal.py          |  2 +-
 .../paddle/fluid/contrib/layers/rnn_impl.py   |  4 +-
 python/paddle/fluid/layers/detection.py       |  2 +-
 python/paddle/fluid/layers/nn.py              | 90 -------------------
 python/paddle/fluid/layers/rnn.py             |  4 +-
 .../dygraph_to_static/ifelse_simple_func.py   | 18 ++--
 .../seq2seq_dygraph_model.py                  |  8 +-
 .../dygraph_to_static/simnet_dygraph_model.py |  2 +-
 .../unittests/dygraph_to_static/test_bmn.py   |  4 +-
 .../unittests/dygraph_to_static/test_dict.py  |  4 +-
 .../unittests/dygraph_to_static/test_lac.py   |  2 +-
 .../dygraph_to_static/test_tensor_shape.py    | 13 ++-
 .../unittests/dygraph_to_static/yolov3.py     |  2 +-
 .../test_dynamic_rnn_stop_gradient.py         |  2 +-
 .../fluid/tests/unittests/test_layers.py      |  2 +-
 .../tests/unittests/test_rnn_cell_api.py      |  2 +-
 .../tests/unittests/test_rnn_decode_api.py    |  4 +-
 ...tatic_shape_inferrence_for_shape_tensor.py |  2 +-
 .../tests/unittests/test_while_loop_op.py     |  2 +-
 .../paddle/jit/dy2static/ast_transformer.py   |  2 +-
 .../paddle/jit/dy2static/convert_operators.py |  5 +-
 python/paddle/tensor/attribute.py             |  2 +-
 22 files changed, 45 insertions(+), 133 deletions(-)

diff --git a/python/paddle/distribution/normal.py b/python/paddle/distribution/normal.py
index 3eb3fd2d59009..7eb9fb597d3a2 100644
--- a/python/paddle/distribution/normal.py
+++ b/python/paddle/distribution/normal.py
@@ -180,7 +180,7 @@ def sample(self, shape=(), seed=0):
                 self.loc + self.scale, batch_shape + shape, self.dtype, 0.0
             )
             zero_tmp_reshape = paddle.reshape(zero_tmp, output_shape)
-            zero_tmp_shape = nn.shape(zero_tmp_reshape)
+            zero_tmp_shape = paddle.shape(zero_tmp_reshape)
             normal_random_tmp = nn.gaussian_random(
                 zero_tmp_shape, mean=0.0, std=1.0, seed=seed, dtype=self.dtype
             )
diff --git a/python/paddle/fluid/contrib/layers/rnn_impl.py b/python/paddle/fluid/contrib/layers/rnn_impl.py
index b28cac87950c9..4e23057fc4680 100644
--- a/python/paddle/fluid/contrib/layers/rnn_impl.py
+++ b/python/paddle/fluid/contrib/layers/rnn_impl.py
@@ -330,7 +330,7 @@ def basic_gru(
 
     mask = None
     if sequence_length:
-        max_seq_len = layers.shape(input)[0]
+        max_seq_len = paddle.shape(input)[0]
         mask = layers.sequence_mask(
             sequence_length, maxlen=max_seq_len, dtype='float32'
         )
@@ -614,7 +614,7 @@ def basic_lstm(
 
     mask = None
     if sequence_length:
-        max_seq_len = layers.shape(input)[0]
+        max_seq_len = paddle.shape(input)[0]
         mask = layers.sequence_mask(
             sequence_length, maxlen=max_seq_len, dtype='float32'
         )
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index d490b0457d98c..dddd2fa386d2b 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -1588,7 +1588,7 @@ def ssd_loss(
         raise ValueError("Only support mining_type == max_negative now.")
 
     num, num_prior, num_class = confidence.shape
-    conf_shape = nn.shape(confidence)
+    conf_shape = paddle.shape(confidence)
 
     def __reshape_to_2d(var):
         out = paddle.flatten(var, 2, -1)
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 4dab44ebe54fe..fda4c24e51068 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -97,7 +97,6 @@
     'elementwise_mul',
     'gaussian_random',
     'sampling_id',
-    'shape',
     'clip',
     'clip_by_norm',
     'mean',
@@ -5010,95 +5009,6 @@ def sampling_id(x, min=0.0, max=1.0, seed=0, dtype='float32'):
     return out
 
 
-def shape(input):
-    """
-    :alias_main: paddle.shape
-        :alias: paddle.shape,paddle.tensor.shape,paddle.tensor.attribute.shape
-        :old_api: paddle.fluid.layers.shape
-
-    **Shape Layer**
-
-    Get the shape of the input.
-
-    .. code-block:: text
-
-        Case1:
-            Given N-D Tensor:
-                input = [ [1, 2, 3, 4], [5, 6, 7, 8] ]
-
-            Then:
-                input.shape = [2, 4]
-
-        Case2:
-            Given SelectedRows:
-                input.rows = [0, 4, 19]
-                input.height = 20
-                input.value = [ [1, 2], [3, 4], [5, 6] ]  # inner tensor
-            Then:
-                input.shape = [3, 2]
-
-    Args:
-        input (Variable): The input can be N-D Tensor or SelectedRows with data type bool, float16, float32, float64, int32, int64.
-                          If input variable is type of SelectedRows, returns the shape of it's inner tensor.
-
-    Returns:
-        Variable (Tensor): The shape of the input variable.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            import numpy as np
-            import paddle
-            paddle.enable_static()
-
-            inputs = fluid.data(name="x", shape=[3, 100, 100], dtype="float32")
-            output = fluid.layers.shape(inputs)
-
-            exe = fluid.Executor(fluid.CPUPlace())
-            exe.run(fluid.default_startup_program())
-
-            img = np.ones((3, 100, 100)).astype(np.float32)
-
-            res = exe.run(fluid.default_main_program(), feed={'x':img}, fetch_list=[output])
-            print(res) # [array([  3, 100, 100], dtype=int32)]
-    """
-    if in_dygraph_mode():
-        out = _C_ops.shape(input)
-        out.stop_gradient = True
-        return out
-    if _in_legacy_dygraph():
-        out = _legacy_C_ops.shape(input)
-        out.stop_gradient = True
-        return out
-
-    check_variable_and_dtype(
-        input,
-        'input',
-        [
-            'bool',
-            'float16',
-            'float32',
-            'float64',
-            'int32',
-            'int64',
-            'complex64',
-            'complex128',
-        ],
-        'shape',
-    )
-    helper = LayerHelper('shape', **locals())
-    out = helper.create_variable_for_type_inference(dtype='int32')
-    helper.append_op(
-        type='shape',
-        inputs={'Input': input},
-        outputs={'Out': out},
-        stop_gradient=True,
-    )
-
-    return out
-
-
 def _elementwise_op(helper):
     op_type = helper.layer_type
     x = helper.kwargs.get('x', None)
diff --git a/python/paddle/fluid/layers/rnn.py b/python/paddle/fluid/layers/rnn.py
index 60ac537ffc6d4..8b5721438d2e5 100644
--- a/python/paddle/fluid/layers/rnn.py
+++ b/python/paddle/fluid/layers/rnn.py
@@ -673,7 +673,7 @@ def _switch_grad(x, stop=False):
         inputs = map_structure(_transpose_batch_time, inputs)
 
     if sequence_length:
-        max_seq_len = nn.shape(flatten(inputs)[0])[0]
+        max_seq_len = paddle.shape(flatten(inputs)[0])[0]
         mask = sequence_lod.sequence_mask(
             sequence_length,
             maxlen=max_seq_len,
@@ -1215,7 +1215,7 @@ def initialize(self, initial_cell_states):
         """
         self.kinf = 1e9
         state = flatten(initial_cell_states)[0]
-        self.batch_size = nn.shape(state)[0]
+        self.batch_size = paddle.shape(state)[0]
 
         self.start_token_tensor = tensor.fill_constant(
             shape=[1], dtype="int64", value=self.start_token
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/ifelse_simple_func.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/ifelse_simple_func.py
index 2fa012559cc77..985d091d6b9c4 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/ifelse_simple_func.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/ifelse_simple_func.py
@@ -151,7 +151,7 @@ def nested_if_else(x_v):
         #  `x_v.shape[0]` is not Tensor, and `batch_size` is the return value of `true_fn` after transformed.
         # col = -1
         # batch_size = x_v.shape[0]
-        batch_size = fluid.layers.shape(x_v)[0]
+        batch_size = paddle.shape(x_v)[0]
 
     # if tensor.shape is [1], now support to compare with numpy.
     if paddle.mean(x_v).numpy() < 0:
@@ -180,7 +180,7 @@ def nested_if_else_2(x):
         z = y
     x_shape_0 = x.shape[0]
     if x_shape_0 < 1:
-        if fluid.layers.shape(y).numpy()[0] < 1:
+        if paddle.shape(y).numpy()[0] < 1:
             res = fluid.layers.fill_constant(
                 value=2, shape=x.shape, dtype="int32"
             )
@@ -212,7 +212,7 @@ def nested_if_else_3(x):
         else:
             out = x - 1
     else:
-        y_shape = fluid.layers.shape(y)
+        y_shape = paddle.shape(y)
         if y_shape.numpy()[0] < 1:
             res = fluid.layers.fill_constant(
                 value=2, shape=x.shape, dtype="int32"
@@ -290,7 +290,7 @@ def forward(self, input):
 
 
 def if_with_and_or(x_v, label=None):
-    batch_size = fluid.layers.shape(x_v)
+    batch_size = paddle.shape(x_v)
     if (
         x_v is not None
         and (paddle.mean(x_v).numpy()[0] > 0 or label is not None)
@@ -308,7 +308,7 @@ def if_with_and_or(x_v, label=None):
 
 
 def if_with_and_or_1(x, y=None):
-    batch_size = fluid.layers.shape(x)
+    batch_size = paddle.shape(x)
     if batch_size[0] > 1 and y is not None:
         x = x + 1
     if y is not None or batch_size[0] > 1:
@@ -317,7 +317,7 @@ def if_with_and_or_1(x, y=None):
 
 
 def if_with_and_or_2(x, y=None):
-    batch_size = fluid.layers.shape(x)
+    batch_size = paddle.shape(x)
     if x is not None and batch_size[0] > 1 and y is not None:
         x = x + 1
     if batch_size[0] > 1 or y is not None or x is not None:
@@ -326,7 +326,7 @@ def if_with_and_or_2(x, y=None):
 
 
 def if_with_and_or_3(x, y=None):
-    batch_size = fluid.layers.shape(x)
+    batch_size = paddle.shape(x)
     mean_res = paddle.mean(x)
     if (
         x is not None
@@ -341,7 +341,7 @@ def if_with_and_or_3(x, y=None):
 
 
 def if_with_and_or_4(x, y=None):
-    batch_size = fluid.layers.shape(x)
+    batch_size = paddle.shape(x)
     mean_res = paddle.mean(x)
     if (x is not None and batch_size[0] > 1) or (
         y is not None and mean_res.numpy()[0] > 0
@@ -361,7 +361,7 @@ def __init__(self):
             self.b = 2
 
     foo = Foo()
-    batch_size = fluid.layers.shape(x)
+    batch_size = paddle.shape(x)
     mean_res = paddle.mean(x)
 
     if batch_size[0] > foo.a:
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py
index d364b8a1a5d60..5c04aecd1a87f 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py
@@ -297,8 +297,8 @@ def forward(self, inputs):
         loss = paddle.nn.functional.softmax_with_cross_entropy(
             logits=dec_output, label=label, soft_label=False
         )
         loss = paddle.squeeze(loss, axis=[2])
-        max_tar_seq_len = fluid.layers.shape(tar)[1]
+        max_tar_seq_len = paddle.shape(tar)[1]
         tar_mask = fluid.layers.sequence_mask(
             tar_sequence_length, maxlen=max_tar_seq_len, dtype='float32'
         )
@@ -833,8 +833,8 @@ def forward(self, inputs):
         loss = paddle.nn.functional.softmax_with_cross_entropy(
             logits=dec_output, label=label, soft_label=False
         )
         loss = paddle.squeeze(loss, axis=[2])
-        max_tar_seq_len = fluid.layers.shape(tar)[1]
+        max_tar_seq_len = paddle.shape(tar)[1]
         tar_mask = fluid.layers.sequence_mask(
             tar_sequence_length, maxlen=max_tar_seq_len, dtype='float32'
         )
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py
index 8c3d62feacc62..d16f07d9a2e34 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py
@@ -210,7 +210,7 @@ def ops(self, input, shape, dtype, value):
         operation
         """
         shape = list(shape)
-        input_shape = fluid.layers.shape(input)
+        input_shape = paddle.shape(input)
         shape[0] = input_shape[0]
         constant = fluid.layers.fill_constant(shape, dtype, value)
         return constant
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py
index 0cb3e333045f5..b7461b21aa612 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py
@@ -321,9 +321,7 @@ def bi_loss(pred_score, gt_label):
             gt_label = paddle.reshape(x=gt_label, shape=[-1])
             gt_label.stop_gradient = True
             pmask = fluid.layers.cast(x=(gt_label > 0.5), dtype=DATATYPE)
-            num_entries = fluid.layers.cast(
-                fluid.layers.shape(pmask), dtype=DATATYPE
-            )
+            num_entries = fluid.layers.cast(paddle.shape(pmask), dtype=DATATYPE)
             num_positive = fluid.layers.cast(paddle.sum(pmask), dtype=DATATYPE)
             ratio = num_entries / num_positive
             coef_0 = 0.5 * ratio / (ratio - 1)
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_dict.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_dict.py
index 742e828aa9acb..597580eedc765 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_dict.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_dict.py
@@ -97,8 +97,8 @@ def forward(self, input, max_len=4):
             ),
         }
         # TODO(Aurelius84): The following code will be converted into:
-        # max_len = layers.cond(layers.shape(input)[0] != max_len,
-        #                       lambda: layers.shape(input)[0], lambda: max_len)
+        # max_len = layers.cond(paddle.shape(input)[0] != max_len,
+        #                       lambda: paddle.shape(input)[0], lambda: max_len)
         # But max_len should be wrapped into tensor, which is not supported.
 
         # Comment out this line of code for now.
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py
index 54b97c9280a40..5aff8c710ae7b 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py
@@ -79,7 +79,7 @@ def forward(self, inputs):
         res = []
         for i in range(inputs.shape[1]):
             if self.is_reverse:
-                j = fluid.layers.shape(inputs)[1] - 1 - i
+                j = paddle.shape(inputs)[1] - 1 - i
             else:
                 j = i
 
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py
index 4e29f2bf6b44a..1ad55d3fbaa41 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py
@@ -38,7 +38,7 @@ def dyfunc_tensor_shape_2(x):
 def dyfunc_tensor_shape_3(x):
     # Transform y.shape but run y.shape actually because y is not Tensor
     x = fluid.dygraph.to_variable(x)
-    y = np.ones(5)
+    y = paddle.ones([1, 5])
     res = paddle.reshape(x, shape=y.shape)
     return res
 
@@ -97,7 +97,7 @@ def dyfunc_paddle_shape_api(x):
     a = paddle.shape(x)[0]
     # alias api will also not be converted.
     alias_old_api = paddle.fluid.layers
-    b = alias_old_api.shape(x)[1]
+    b = paddle.shape(x)[1]
     res = paddle.reshape(x, shape=(b, a))
     return res
 
@@ -199,7 +199,7 @@ def dyfunc_with_while_3(x):
 
 def dyfunc_with_while_4(x):
     x = paddle.to_tensor(x)
-    y = np.ones(5)
+    y = paddle.ones([1, 5])
     y_shape_0 = y.shape[0]
     i = 1
 
@@ -309,6 +309,11 @@ class TestTensorShapeBasic3(TestTensorShapeBasic):
     def init_test_func(self):
         self.dygraph_func = dyfunc_tensor_shape_3
 
+    def _set_expected_op_num(self):
+        self.expected_op_num = 3
+        self.expected_shape_op_num = 0
+        self.expected_slice_op_num = 0
+
 
 class TestTensorShapeBasic4(TestTensorShapeBasic):
     def init_test_func(self):
@@ -475,7 +480,7 @@ def init_test_func(self):
         self.dygraph_func = dyfunc_with_while_4
 
     def _set_expected_op_num(self):
-        self.expected_op_num = 4
+        self.expected_op_num = 1
         self.expected_shape_op_num = 0
         self.expected_slice_op_num = 0
 
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/yolov3.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/yolov3.py
index 5f894744700f0..5cf1f0f0f533a 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/yolov3.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/yolov3.py
@@ -203,7 +203,7 @@ def __init__(self, scale=2):
 
     def forward(self, inputs):
         # get dynamic upsample output shape
-        shape_nchw = fluid.layers.shape(inputs)
+        shape_nchw = paddle.shape(inputs)
         shape_hw = paddle.slice(shape_nchw, axes=[0], starts=[2], ends=[4])
         shape_hw.stop_gradient = True
         in_shape = fluid.layers.cast(shape_hw, dtype='int32')
diff --git a/python/paddle/fluid/tests/unittests/test_dynamic_rnn_stop_gradient.py b/python/paddle/fluid/tests/unittests/test_dynamic_rnn_stop_gradient.py
index 9774ea32e44f6..3e3eefd5d278d 100644
--- a/python/paddle/fluid/tests/unittests/test_dynamic_rnn_stop_gradient.py
+++ b/python/paddle/fluid/tests/unittests/test_dynamic_rnn_stop_gradient.py
@@ -40,7 +40,7 @@ def build_and_run_program(place, batch_size, beam_size, stop_gradient=False):
     while_op = layers.While(cond)
     scores = layers.array_write(x, step_idx)
     with while_op.block():
-        bs = layers.cast(layers.shape(x)[0], "int64")
+        bs = layers.cast(paddle.shape(x)[0], "int64")
         for _ in range(20):
             bs = layers.cast(bs, 'int64')
         bs.stop_gradient = stop_gradient
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index 25b6d0513d1c8..64e671c8ee9bd 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -3307,7 +3307,7 @@ def make_shape(self):
             input = self._get_data(
                 name="input", shape=[3, 100, 100], dtype="float32"
             )
-            out = layers.shape(input)
+            out = paddle.shape(input)
             return out
 
     def make_pad2d(self):
diff --git a/python/paddle/fluid/tests/unittests/test_rnn_cell_api.py b/python/paddle/fluid/tests/unittests/test_rnn_cell_api.py
index 6b2383ed56933..2302e6f0d241f 100644
--- a/python/paddle/fluid/tests/unittests/test_rnn_cell_api.py
+++ b/python/paddle/fluid/tests/unittests/test_rnn_cell_api.py
@@ -635,7 +635,7 @@ def def_seq2seq_model(
         logits=logits, label=label, soft_label=False
     )
     loss = layers.unsqueeze(loss, axes=[2])
-    max_tar_seq_len = layers.shape(target)[1]
+    max_tar_seq_len = paddle.shape(target)[1]
     tar_mask = layers.sequence_mask(
         target_length, maxlen=max_tar_seq_len, dtype="float32"
     )
diff --git a/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py b/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py
index 3b3539c4861f1..67657071db83f 100644
--- a/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py
+++ b/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py
@@ -248,7 +248,7 @@ def __call__(self, src, src_length, trg=None, trg_length=None):
             ),
         ]
         src_mask = layers.sequence_mask(
-            src_length, maxlen=layers.shape(src)[1], dtype="float32"
+            src_length, maxlen=paddle.shape(src)[1], dtype="float32"
         )
         encoder_padding_mask = (src_mask - 1.0) * 1e9
         encoder_padding_mask = layers.unsqueeze(encoder_padding_mask, [1])
@@ -400,7 +400,7 @@ def __init__(self, lr=None):
 
     def learn(self, probs, label, weight=None, length=None):
         loss = layers.cross_entropy(input=probs, label=label, soft_label=False)
-        max_seq_len = layers.shape(probs)[1]
+        max_seq_len = paddle.shape(probs)[1]
         mask = layers.sequence_mask(length, maxlen=max_seq_len, dtype="float32")
         loss = loss * mask
         loss = paddle.mean(loss, axis=[0])
diff --git a/python/paddle/fluid/tests/unittests/test_static_shape_inferrence_for_shape_tensor.py b/python/paddle/fluid/tests/unittests/test_static_shape_inferrence_for_shape_tensor.py
index 17e7f69a3b49e..6f70e553cc2bc 100644
--- a/python/paddle/fluid/tests/unittests/test_static_shape_inferrence_for_shape_tensor.py
+++ b/python/paddle/fluid/tests/unittests/test_static_shape_inferrence_for_shape_tensor.py
@@ -23,7 +23,7 @@ def test_static_graph(self):
         data = paddle.fluid.layers.data(
             name="x", shape=[-1, 2], dtype='float32'
         )
-        shape = paddle.fluid.layers.shape(data)  # shape should be [-1, 2]
+        shape = paddle.shape(data)  # shape should be [-1, 2]
         x = paddle.fluid.layers.uniform_random(shape)
         self.assertEqual(x.shape, data.shape)
         paddle.disable_static()
diff --git a/python/paddle/fluid/tests/unittests/test_while_loop_op.py b/python/paddle/fluid/tests/unittests/test_while_loop_op.py
index 8e733ef9208b3..8c94834c9a28b 100644
--- a/python/paddle/fluid/tests/unittests/test_while_loop_op.py
+++ b/python/paddle/fluid/tests/unittests/test_while_loop_op.py
@@ -595,7 +595,7 @@ def body(z, i):
         with program_guard(main_program, startup_program):
             x = fluid.layers.data(name='x', shape=[5], dtype='int32')
             z = fluid.layers.fill_constant([1], 'int32', 0)
-            x_shape = fluid.layers.shape(x)
+            x_shape = paddle.shape(x)
             i = fluid.layers.fill_constant([1], 'int32', 0)
             z, _ = fluid.layers.while_loop(cond, body, [z, i])
 
diff --git a/python/paddle/jit/dy2static/ast_transformer.py b/python/paddle/jit/dy2static/ast_transformer.py
index 2e244d6f34183..826232e723f60 100644
--- a/python/paddle/jit/dy2static/ast_transformer.py
+++ b/python/paddle/jit/dy2static/ast_transformer.py
@@ -127,7 +127,7 @@ def transfer_from_node_type(self, node_wrapper):
         transformers = [
             EarlyReturnTransformer,
             BasicApiTransformer,  # Basic Api
-            TensorShapeTransformer,  # Tensor.shape -> layers.shape(Tensor)
+            TensorShapeTransformer,  # Tensor.shape -> paddle.shape(Tensor)
             BreakContinueTransformer,  # break/continue in loops
             ReturnTransformer,  # return in functions
             LogicalTransformer,  # logical and/or/not
diff --git a/python/paddle/jit/dy2static/convert_operators.py b/python/paddle/jit/dy2static/convert_operators.py
index fa622b14094de..df20a5c4e0c98 100644
--- a/python/paddle/jit/dy2static/convert_operators.py
+++ b/python/paddle/jit/dy2static/convert_operators.py
@@ -31,7 +31,6 @@
 from paddle.fluid.layers import (
     cast,
     control_flow,
-    nn,
 )
 from paddle.fluid.layers.control_flow import (
     cond,
@@ -524,7 +523,7 @@ def convert_len(var):
             # so we return a variable dynamically inferred from var.shape.
             if var.shape[0] > 0 and var.type == core.VarDesc.VarType.LOD_TENSOR:
                 return var.shape[0]
-            return nn.shape(var)[0]
+            return paddle.shape(var)[0]
         elif var.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY:
             return paddle.tensor.array_length(var)
         else:
@@ -607,7 +606,7 @@ def has_negative(list_shape):
     if isinstance(x, Variable):
         values = list(x.shape)
         if has_negative(values):
-            shape_tensor = nn.shape(x)
+            shape_tensor = paddle.shape(x)
             for i, v in enumerate(values):
                 if v is None or v < 0:
                     values[i] = shape_tensor[i]
diff --git a/python/paddle/tensor/attribute.py b/python/paddle/tensor/attribute.py
index d6f08f676b8ae..ac5f99a0b1c3c 100644
--- a/python/paddle/tensor/attribute.py
+++ b/python/paddle/tensor/attribute.py
@@ -93,7 +93,7 @@ def shape(input):
             paddle.enable_static()
 
             inputs = fluid.data(name="x", shape=[3, 100, 100], dtype="float32")
-            output = fluid.layers.shape(inputs)
+            output = paddle.shape(inputs)
 
             exe = fluid.Executor(fluid.CPUPlace())
             exe.run(fluid.default_startup_program())

From cee7a3dbe8638ab1dc619bacd2ef679de45d3470 Mon Sep 17 00:00:00 2001
From: ShenLiang <1422485404@qq.com>
Date: Mon, 5 Dec 2022 13:17:24 +0800
Subject: [PATCH 05/13] fix bug of reducer in best_fit (#48668)

---
 .../fluid/distributed/collective/reducer.cc   | 33 ++++++++++++-------
 paddle/fluid/distributed/collective/reducer.h |  2 +-
 .../test_parallel_dygraph_dataparallel.py     | 17 +++++++++-
 3 files changed, 39 insertions(+), 13 deletions(-)

diff --git a/paddle/fluid/distributed/collective/reducer.cc b/paddle/fluid/distributed/collective/reducer.cc
index cd8c8ed2e0cc9..379bc57d5594e 100644
--- a/paddle/fluid/distributed/collective/reducer.cc
+++ b/paddle/fluid/distributed/collective/reducer.cc
@@ -17,10 +17,16 @@
 #include "paddle/phi/backends/device_manager.h"
 
 DECLARE_bool(use_stream_safe_cuda_allocator);
+DECLARE_string(allocator_strategy);
 
 namespace paddle {
 namespace distributed {
 
+static bool IsStreamSafeAllocator() {
+  return FLAGS_allocator_strategy == "auto_growth" &&
+         FLAGS_use_stream_safe_cuda_allocator;
+}
+
 static Backend TransToBackend(platform::Place place) {
   static const std::map<phi::AllocationType, Backend> type_backend = {
       {phi::AllocationType::GPU, Backend::GPU},
@@ -399,14 +405,14 @@ void EagerGroup::ConcatTensors(const platform::Place &place) {
   }
 }
 
-void EagerGroup::SplitTensorsDev(const platform::DeviceContext &context) {
+void EagerGroup::SplitTensors(const platform::DeviceContext &context) {
   auto place = context.GetPlace();
   if (platform::is_gpu_place(place)) {
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
     auto &gpu_context = static_cast<const phi::GPUContext &>(context);
     SplitTensorsWithType(
         gpu_context, &dense_contents_, &dense_tensors_, dtype_);
-    if (FLAGS_use_stream_safe_cuda_allocator) {
+    if (IsStreamSafeAllocator()) {
       auto dense_tensor =
           std::dynamic_pointer_cast<phi::DenseTensor>(dense_contents_.impl());
       VLOG(3) << "Free dense_contents_ " << dense_contents_.numel();
@@ -1011,12 +1017,11 @@ void EagerReducer::FinalizeBackward() {
   for (auto &group : groups_) {
     if (!group.is_sparse_) {
       group.task->Synchronize();
-    }
-  }
-
-  for (auto &group : groups_) {
-    if (!group.is_sparse_) {
-      group.dense_contents_.reset();
+      if (!IsStreamSafeAllocator()) {
+        auto *default_ctx =
+            platform::DeviceContextPool::Instance().Get(inner_place_);
+        group.SplitTensors(*default_ctx);
+      }
     }
   }
 
@@ -1054,9 +1059,15 @@ void EagerReducer::FusedAllReduceSchedule(EagerGroup *group,
   group->task = process_group_->AllReduce(in_out, in_out, opts);
 
   auto *context = process_group_->GetDeviceContext(inner_place_);
-  group->SplitTensorsDev(*context);
-  group->task->UpdateWaitChain(*context);
-  // split in FinalizeBackward()
+
+  if (IsStreamSafeAllocator()) {
+    // NOTE(shenliang03): The best_fit allocator strategy is not safe across
+    // multiple streams. The Split operator allocates additional memory for its
+    // computation, so running it asynchronously under best_fit may cause an
+    // illegal memory access. Only split here with the stream-safe allocator.
+    group->SplitTensors(*context);
+    group->task->UpdateWaitChain(*context);
+  }
 }
 
 void EagerReducer::AllReduceSparse(EagerGroup *group,
diff --git a/paddle/fluid/distributed/collective/reducer.h b/paddle/fluid/distributed/collective/reducer.h
index 5d27086fdbec5..5be2d60a6a654 100644
--- a/paddle/fluid/distributed/collective/reducer.h
+++ b/paddle/fluid/distributed/collective/reducer.h
@@ -75,7 +75,7 @@ class EagerGroup {
 
   // context is used to select the stream for split
 
-  void SplitTensorsDev(const platform::DeviceContext &);
+  void SplitTensors(const platform::DeviceContext &);
 
   friend std::ostream &operator<<(std::ostream &, const EagerGroup &);
 };
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py
index 9e4be19dacade..5fd7f3beb117e 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py
@@ -103,6 +103,7 @@ def start_local_trainers(
     training_script,
     training_script_args,
     eager_mode=True,
+    allocator_strategy="auto_growth",
     log_dir=None,
 ):
     current_env = copy.copy(os.environ.copy())
@@ -126,6 +127,10 @@ def start_local_trainers(
         if not eager_mode:
             proc_env["FLAGS_enable_eager_mode"] = "%d" % 0
 
+        proc_env["FLAGS_allocator_strategy"] = allocator_strategy
+        if allocator_strategy == "auto_growth":
+            proc_env["FLAGS_fraction_of_gpu_memory_to_use"] = "0.1"
+
         current_env.update(proc_env)
 
         print("trainer proc env:{}".format(current_env))
@@ -153,7 +158,12 @@ def start_local_trainers(
 
 
 class TestMultipleGpus(unittest.TestCase):
-    def run_mnist_2gpu(self, target_file_name, eager_mode=True):
+    def run_mnist_2gpu(
+        self,
+        target_file_name,
+        eager_mode=True,
+        allocator_strategy="auto_growth",
+    ):
         if (
             not fluid.core.is_compiled_with_cuda()
             or fluid.core.get_cuda_device_count() == 0
@@ -170,6 +180,7 @@ def run_mnist_2gpu(self, target_file_name, eager_mode=True):
             cluster,
             pod,
             eager_mode=eager_mode,
+            allocator_strategy=allocator_strategy,
             training_script=target_file_name,
             training_script_args=[],
         )
@@ -218,6 +229,10 @@ def test_parallel_dygraph_dataparallel_with_pylayer(self):
         self.run_mnist_2gpu(
             'parallel_dygraph_dataparallel_with_pylayer.py', eager_mode=False
         )
+        self.run_mnist_2gpu(
+            'parallel_dygraph_dataparallel_with_pylayer.py',
+            allocator_strategy="naive_best_fit",
+        )
 
 
 class TestGradientCheckInEagerMode(TestMultipleGpus):

From d6aa0d43bc26f0e472a996b4def4ae81f78bfbd4 Mon Sep 17 00:00:00 2001
From: Netpunk <69072522+Patrick-Star125@users.noreply.github.com>
Date: Mon, 5 Dec 2022 13:43:54 +0800
Subject: [PATCH 06/13] [PHI decoupling] migrate poly_util.h to phi (#48499)

* rm poly_util.h

* format code

* fix some problems

* format code
---
 paddle/phi/kernels/funcs/detection/nms_util.h        |  8 ++++----
 .../kernels/funcs}/detection/poly_util.cc            | 12 +++++-------
 .../kernels/funcs}/detection/poly_util.h             | 11 +++++------
 3 files changed, 14 insertions(+), 17 deletions(-)
 rename paddle/{fluid/operators => phi/kernels/funcs}/detection/poly_util.cc (95%)
 rename paddle/{fluid/operators => phi/kernels/funcs}/detection/poly_util.h (91%)

diff --git a/paddle/phi/kernels/funcs/detection/nms_util.h b/paddle/phi/kernels/funcs/detection/nms_util.h
index e862b2a90f06c..4e2398fbb9651 100644
--- a/paddle/phi/kernels/funcs/detection/nms_util.h
+++ b/paddle/phi/kernels/funcs/detection/nms_util.h
@@ -17,9 +17,9 @@ limitations under the License. */
 #include <utility>
 #include <vector>
 
-#include "paddle/fluid/operators/detection/poly_util.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/device_context.h"
+#include "paddle/phi/kernels/funcs/detection/poly_util.h"
 
 namespace phi {
 namespace funcs {
@@ -96,10 +96,10 @@ T PolyIoU(const T* box1,
           const T* box2,
           const size_t box_size,
           const bool normalized) {
-  T bbox1_area = paddle::operators::PolyArea<T>(box1, box_size, normalized);
-  T bbox2_area = paddle::operators::PolyArea<T>(box2, box_size, normalized);
+  T bbox1_area = phi::funcs::PolyArea<T>(box1, box_size, normalized);
+  T bbox2_area = phi::funcs::PolyArea<T>(box2, box_size, normalized);
   T inter_area =
-      paddle::operators::PolyOverlapArea<T>(box1, box2, box_size, normalized);
+      phi::funcs::PolyOverlapArea<T>(box1, box2, box_size, normalized);
   if (bbox1_area == 0 || bbox2_area == 0 || inter_area == 0) {
     // If coordinate values are invalid
     // if area size <= 0,  return 0.
diff --git a/paddle/fluid/operators/detection/poly_util.cc b/paddle/phi/kernels/funcs/detection/poly_util.cc
similarity index 95%
rename from paddle/fluid/operators/detection/poly_util.cc
rename to paddle/phi/kernels/funcs/detection/poly_util.cc
index d8beabd0a04b3..fd8037a82eabc 100644
--- a/paddle/fluid/operators/detection/poly_util.cc
+++ b/paddle/phi/kernels/funcs/detection/poly_util.cc
@@ -14,12 +14,10 @@ limitations under the License. */
 #ifndef POLY_UTIL_CC_
 #define POLY_UTIL_CC_
 
-#include "paddle/fluid/operators/detection/poly_util.h"
+#include "paddle/phi/kernels/funcs/detection/poly_util.h"
 
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
+namespace phi {
+namespace funcs {
 
 using phi::funcs::gpc_free_polygon;
 using phi::funcs::gpc_polygon_clip;
@@ -134,7 +132,7 @@ T PolyOverlapArea(const T* box1,
   return inter_area;
 }
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace funcs
+}  // namespace phi
 
 #endif
diff --git a/paddle/fluid/operators/detection/poly_util.h b/paddle/phi/kernels/funcs/detection/poly_util.h
similarity index 91%
rename from paddle/fluid/operators/detection/poly_util.h
rename to paddle/phi/kernels/funcs/detection/poly_util.h
index ad7611c165b79..6d527d2d95f9e 100644
--- a/paddle/fluid/operators/detection/poly_util.h
+++ b/paddle/phi/kernels/funcs/detection/poly_util.h
@@ -15,11 +15,10 @@ limitations under the License. */
 
 #include <vector>
 
-#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/kernels/funcs/gpc.h"
 
-namespace paddle {
-namespace operators {
+namespace phi {
+namespace funcs {
 
 template <class T>
 class Point_ {
@@ -70,7 +69,7 @@ T PolyOverlapArea(const T* box1,
                   const T* box2,
                   const size_t box_size,
                   const bool normalized);
-}  // namespace operators
-}  // namespace paddle
+}  // namespace funcs
+}  // namespace phi
 
-#include "paddle/fluid/operators/detection/poly_util.cc"
+#include "paddle/phi/kernels/funcs/detection/poly_util.cc"

From 93027d9f57112bc05e6777c153896ed6eb913db1 Mon Sep 17 00:00:00 2001
From: heyanru <81976792+heyanru01@users.noreply.github.com>
Date: Mon, 5 Dec 2022 14:01:51 +0800
Subject: [PATCH 07/13] [Fluid Clean] remove nn.topk, nn.ctc_greedy_decoder,
 nn.im2sequence, nn.multiplex, nn.smooth_l1 (#48289)

---
 .../phi/kernels/funcs/transpose_functor.cu.h  |   2 +-
 python/paddle/fluid/layers/detection.py       |   3 +-
 python/paddle/fluid/layers/nn.py              | 579 ------------------
 .../fluid/tests/unittests/dist_transformer.py |   4 +-
 .../seq2seq_dygraph_model.py                  |   4 +-
 .../transformer_dygraph_model.py              |   4 +-
 .../tests/unittests/ipu/test_topk_op_ipu.py   |   2 +-
 .../npu/test_smooth_l1_loss_op_npu.py         |  17 -
 .../tests/unittests/test_beam_search_op.py    |   2 +-
 .../fluid/tests/unittests/test_ctc_align.py   |  45 --
 .../fluid/tests/unittests/test_layers.py      |  35 +-
 .../tests/unittests/test_smooth_l1_loss_op.py |  19 -
 12 files changed, 15 insertions(+), 701 deletions(-)

diff --git a/paddle/phi/kernels/funcs/transpose_functor.cu.h b/paddle/phi/kernels/funcs/transpose_functor.cu.h
index 0d24fdebef148..8dae6ab60e99a 100644
--- a/paddle/phi/kernels/funcs/transpose_functor.cu.h
+++ b/paddle/phi/kernels/funcs/transpose_functor.cu.h
@@ -475,7 +475,7 @@ void SwapDim1And2InNarrow(const phi::GPUContext& d,
         CeilOrFloor<int, false>(input_long_edge, proposed_tile_long_edge) *
             proposed_tile_long_edge;
 
-    int num_full_tiles = 
+    int num_full_tiles =
         CeilOrFloor<int, false>(input_long_edge, proposed_tile_long_edge);
 
     float cost = num_wasted_threads;
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index dddd2fa386d2b..486daac6092c6 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -1688,7 +1688,8 @@ def __reshape_to_2d(var):
     location = __reshape_to_2d(location)
     target_bbox = __reshape_to_2d(target_bbox)
 
-    loc_loss = nn.smooth_l1(location, target_bbox)
+    smooth_l1_loss = paddle.nn.loss.SmoothL1Loss()
+    loc_loss = smooth_l1_loss(location, target_bbox)
     target_loc_weight = __reshape_to_2d(target_loc_weight)
     loc_loss = loc_loss * target_loc_weight
 
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index fda4c24e51068..a4125088c8a2f 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -72,16 +72,11 @@
     'batch_norm',
     'dropout',
     'split',
-    'ctc_greedy_decoder',
     'l2_normalize',
     'matmul',
-    'topk',
-    'im2sequence',
     'row_conv',
-    'multiplex',
     'layer_norm',
     'spectral_norm',
-    'smooth_l1',
     'one_hot',
     'autoincreased_step_counter',
     'unsqueeze',
@@ -2751,421 +2746,6 @@ def __check_input(x, y):
     return out
 
 
-def topk(input, k, name=None):
-    """
-    :alias_main: paddle.topk
-        :alias: paddle.topk,paddle.tensor.topk,paddle.tensor.search.topk
-        :old_api: paddle.fluid.layers.topk
-
-    This OP is used to find values and indices of the k largest entries
-    for the last dimension.
-
-    If the input is a 1-D Tensor, finds the k largest entries and outputs
-    their values and indices.
-
-    If the input is a Tensor with higher rank, this operator computes the top k
-    entries along the last dimension.
-
-    .. code-block:: text
-
-        Case 1:
-
-          Input:
-            input.shape = [3, 4]
-            input.data = [[5, 4, 2, 3],
-                     [9, 7, 10, 25],
-                     [6, 2, 10, 1]]
-            k = 2
-
-          Output:
-            The first output:
-            values.shape = [3, 2]
-            values.data = [[5, 4],
-                      [10, 25],
-                      [6, 10]]
-
-            The second output:
-            indices.shape = [3, 2]
-            indices.data = [[0, 1],
-                       [2, 3],
-                       [0, 2]]
-
-    Args:
-        input(Variable): The input tensor. Support data types: float32, float64.
-        k(int | Variable): The number of top elements to look for along the last dimension
-                           of input tensor.
-        name (str, optional): Please refer to :ref:`api_guide_Name`, Default None.
-
-    Returns:
-        Values (Variable): Input tensor's k largest elements along each last dimensional slice. The dimension is: :math:`input.shape[:-1]+[k]`.
-        Indices (Variable): Indices of k largest elements alone the last dimension of input. The dimension is same as values.
-
-    Raises:
-        ValueError: If :math:`k < 1` or :math:`k > last dimension of input`.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            import paddle.fluid.layers as layers
-            # set batch size=None
-            input = fluid.data(name="input", shape=[None, 13, 11], dtype='float32')
-            top5_values, top5_indices = layers.topk(input, k=5) # top5_values.shape[None, 13, 5], top5_indices.shape=[None, 13, 5]
-
-            # 1D Tensor
-            input1 = fluid.data(name="input1", shape=[None, 13], dtype='float32')
-            top5_values, top5_indices = layers.topk(input1, k=5) #top5_values.shape=[None, 5], top5_indices.shape=[None, 5]
-
-            # k=Variable
-            input2 = fluid.data(name="input2", shape=[None, 13, 11], dtype='float32')
-            vk = fluid.data(name="vk", shape=[None, 1], dtype='int32') # save k in vk.data[0]
-            vk_values, vk_indices = layers.topk(input2, k=vk) #vk_values.shape=[None, 13, k], vk_indices.shape=[None, 13, k]
-
-    """
-    if _non_static_mode():
-        _k = k.numpy().item(0) if isinstance(k, Variable) else k
-        out, indices = _legacy_C_ops.top_k(input, 'k', _k)
-        out.stop_gradient = True
-        indices.stop_gradient = True
-        return out, indices
-
-    inputs = {"X": [input]}
-    attrs = {}
-    if isinstance(k, Variable):
-        inputs['K'] = [k]
-    else:
-        attrs = {'k': k}
-
-    helper = LayerHelper("top_k", **locals())
-    values = helper.create_variable_for_type_inference(dtype=input.dtype)
-    indices = helper.create_variable_for_type_inference(dtype="int64")
-
-    helper.append_op(
-        type="top_k",
-        inputs=inputs,
-        outputs={"Out": [values], "Indices": [indices]},
-        attrs=attrs,
-    )
-    values.stop_gradient = True
-    indices.stop_gradient = True
-    return values, indices
-
-
-def ctc_greedy_decoder(
-    input, blank, input_length=None, padding_value=0, name=None
-):
-    r"""
-    This op is used to decode sequences by greedy policy by the following steps:
-
-    1. Get the indexes of maximum value for each row in input. a.k.a.
-       numpy.argmax(input, axis=0).
-    2. For each sequence in result of step1, merge repeated tokens between two
-       blanks and delete all blanks.
-
-    This op is implemented in two modes: lod and padding, either of them can be used.
-    The input can be either LoDTensor or Tensor, corresponding to lod and padding
-    mode respectively.
-
-    A simple example as below:
-
-    .. code-block:: text
-
-        Given:
-        (1) for lod mode:
-
-        input.data = [[0.6, 0.1, 0.3, 0.1],
-                      [0.3, 0.2, 0.4, 0.1],
-                      [0.1, 0.5, 0.1, 0.3],
-                      [0.5, 0.1, 0.3, 0.1],
-
-                      [0.5, 0.1, 0.3, 0.1],
-                      [0.2, 0.2, 0.2, 0.4],
-                      [0.2, 0.2, 0.1, 0.5],
-                      [0.5, 0.1, 0.3, 0.1]]
-
-        input.lod = [[4, 4]]
-
-        Computation:
-
-        step1: Apply argmax to first input sequence which is input.data[0:4]. Then we get:
-               [[0], [2], [1], [0]]
-        step2: merge repeated tokens and remove blank which is 0. Then we get first output sequence:
-               [[2], [1]]
-
-        Finally:
-
-        output.data = [[2],
-                       [1],
-                       [3]]
-
-        output.lod = [[2, 1]]
-
-        (2) for padding mode:
-
-         input.data = [[[0.6, 0.1, 0.3, 0.1],
-                        [0.3, 0.2, 0.4, 0.1],
-                        [0.1, 0.5, 0.1, 0.3],
-                        [0.5, 0.1, 0.3, 0.1]],
-
-                       [[0.5, 0.1, 0.3, 0.1],
-                        [0.2, 0.2, 0.2, 0.4],
-                        [0.2, 0.2, 0.1, 0.5],
-                        [0.5, 0.1, 0.3, 0.1]]]
-
-        input_length.data = [[4], [4]]
-        input.shape = [2, 4, 4]
-
-        step1: Apply argmax to first input sequence which is input.data[0:4]. Then we get:
-               [[0], [2], [1], [0]], for input.data[4:8] is [[0], [3], [3], [0]], shape is [2,4,1]
-        step2: Change the argmax result to use padding mode, then argmax result is
-                [[0, 2, 1, 0], [0, 3, 3, 0]], shape is [2, 4], lod is [], input_length is [[4], [4]]
-        step3: Apply ctc_align to padding argmax result, padding_value is 0
-
-        Finally:
-        output.data = [[2, 1, 0, 0],
-                       [3, 0, 0, 0]]
-        output_length.data = [[2], [1]]
-
-
-    Parameters:
-
-        input(Variable): the probabilities of variable-length sequences. When in lod mode,
-                         it is a 2-D LoDTensor with LoD information. It's shape is [Lp, num_classes + 1]
-                         where Lp is the sum of all input sequences' length and
-                         num_classes is the true number of classes. When in padding mode,
-                         it is a 3-D Tensor with padding, It's shape is [batch_size, N, num_classes + 1].
-                         (not including the blank label). The data type can be float32 or float64.
-        blank(int): the blank label index of Connectionist Temporal
-                    Classification (CTC) loss, which is in the half-opened
-                    interval [0, num_classes + 1).
-        input_length(Variable, optional): 2-D LoDTensor, shape is [batch_size, 1], data type is int64.
-                                 It is used for padding mode. In lod mode, input_length is None.
-        padding_value(int): padding value.
-        name(str, optional): The default value is None.
-                             Normally there is no need for user to set this property.
-                             For more information, please refer to :ref:`api_guide_Name`
-
-    Returns:
-        For lod mode, returns the result of CTC greedy decoder, 2-D LoDTensor, shape is [Lp, 1], \
-        data type is int64. 'Lp' is the sum of all output sequences' length. If all the sequences \
-        in result were empty, the result LoDTensor will be [-1] with  empty \
-        LoD [[]].
-
-        For padding mode, returns a tuple of (output, output_length), which was described as below:
-
-        output, 2-D Tensor, shape is [batch_size, N], data type is int64.
-
-        output_length, 2-D Tensor, shape is [batch_size, 1], data type is int64. It is the length of \
-                           each sequence of output for padding mode.
-
-    Return type:
-        For lod mode: Variable
-
-        For padding mode: tuple of two Variables (output, output_length).
-
-
-    Examples:
-        .. code-block:: python
-
-            # for lod mode
-            import paddle.fluid as fluid
-            x = fluid.data(name='x', shape=[None, 8], dtype='float32', lod_level=1)
-            cost = fluid.layers.ctc_greedy_decoder(input=x, blank=0)
-
-            # for padding mode
-            x_pad = fluid.data(name='x_pad', shape=[10, 4, 8], dtype='float32')
-            x_pad_len = fluid.data(name='x_pad_len', shape=[10, 1], dtype='int64')
-            out, out_len = fluid.layers.ctc_greedy_decoder(input=x_pad, blank=0,
-                            input_length=x_pad_len)
-
-    """
-    check_variable_and_dtype(
-        input, 'input', ['float32', 'float64'], 'ctc_greedy_decoder'
-    )
-
-    helper = LayerHelper("ctc_greedy_decoder", **locals())
-    _, topk_indices = topk(input, k=1)
-
-    # ctc align op
-    ctc_out = helper.create_variable_for_type_inference(dtype="int64")
-
-    if input_length is None:
-        helper.append_op(
-            type="ctc_align",
-            inputs={"Input": [topk_indices]},
-            outputs={"Output": [ctc_out]},
-            attrs={"merge_repeated": True, "blank": blank},
-        )
-        return ctc_out
-    else:
-        ctc_out_len = helper.create_variable_for_type_inference(dtype="int64")
-        ctc_input = paddle.squeeze(topk_indices, [2])
-
-        helper.append_op(
-            type="ctc_align",
-            inputs={"Input": [ctc_input], "InputLength": [input_length]},
-            outputs={"Output": [ctc_out], "OutputLength": [ctc_out_len]},
-            attrs={
-                "merge_repeated": True,
-                "blank": blank,
-                "padding_value": padding_value,
-            },
-        )
-        return ctc_out, ctc_out_len
-
-
-def im2sequence(
-    input,
-    filter_size=1,
-    stride=1,
-    padding=0,
-    input_image_size=None,
-    out_stride=1,
-    name=None,
-):
-    r"""
-    :api_attr: Static Graph
-
-    Extracts image patches from the input tensor to form a tensor of shape
-    {input.batch_size * output_height * output_width, filter_size_height *
-    filter_size_width * input.channels}. This op use filter to scan images
-    and convert these images to sequences. After expanding, the number of time step are
-    output_height * output_width for an image, in which output_height and
-    output_width are calculated by below equation:
-
-    .. math::
-
-        output\_height  = 1 + \
-            (padding\_up + padding\_down + input\_height  - filter\_size\_height  + stride\_height - 1) / stride\_height \\\\
-        output\_width  = 1 + \
-            (padding\_left + padding\_right + input\_width  - filter\_size\_width  + stride\_width - 1) / stride\_width
-
-    And the dimension of each time step is filter_size_height * filter_size_width * input.channels.
-
-    Parameters:
-        input (Variable): The input should be a 4-D Tensor in :math:`NCHW` format. The data type is float32.
-
-        filter_size(int32 | List[int32]): The filter size. If filter_size is a List,
-            it must contain two integers, :math:`[filter\_size\_height, filter\_size\_width]` .
-            Otherwise, the filter size will be a square :math:`[filter\_size, filter\_size]` . Default is 1.
-
-        stride(int32 | List[int32]): The stride size. If stride is a List, it must
-            contain two integers, :math:`[stride\_height, stride\_width]` . Otherwise, the stride size will be a square :math:`[stride\_size, stride\_size]` . Default is 1.
-
-        padding(int32 | List[int32]): The padding size. If padding is a List, it can
-            contain four integers like :math:`[padding\_up, padding\_left, padding\_down, padding\_right]` to indicate
-            paddings of four direction.  Or it can contain two integers :math:`[padding\_height, padding\_width]` which means
-            padding_up = padding_down = padding_height and
-            padding_left = padding_right = padding_width. Otherwise, a scalar padding means
-            padding_up = padding_down = padding_left = padding_right = padding.
-            Default is 0.
-
-        input_image_size(Variable, optional): the input contains image real size.It's dim
-            is :math:`[batchsize, 2]` . It is just for batch inference when not None. Default is None.
-
-        out_stride(int32 | List[int32]): The scaling of image through CNN. It is valid only when input_image_size is not None.
-            If out_stride is List,  it must contain two integers,
-            :math:`[out\_stride\_height, out\_stride\_W]` . Otherwise,
-            the out_stride_height = out_stride_width = out_stride. Default is 1.
-
-        name (str, optional): The default value is None.  Normally there is no need for
-                    user to set this property.  For more information, please refer to :ref:`api_guide_Name` .
-
-    Returns:
-            The output is a 2-D LoDTensor with shape {input.batch\_size * output\_height * output\_width, \
-            filter\_size\_height * filter\_size\_width * input.channels}. The data type is float32.
-
-    Return Type: Variable
-
-    Examples:
-
-        .. code-block:: text
-
-            Given:
-
-            x = [[[[ 6.  2.  1.]
-                   [ 8.  3.  5.]
-                   [ 0.  2.  6.]]
-
-                  [[ 2.  4.  4.]
-                   [ 6.  3.  0.]
-                   [ 6.  4.  7.]]]
-
-                 [[[ 6.  7.  1.]
-                   [ 5.  7.  9.]
-                   [ 2.  4.  8.]]
-
-                  [[ 1.  2.  1.]
-                   [ 1.  3.  5.]
-                   [ 9.  0.  8.]]]]
-
-            x.dims = {2, 2, 3, 3}
-
-            And:
-
-            filter = [2, 2]
-            stride = [1, 1]
-            padding = [0, 0]
-
-            Then:
-
-            output.data = [[ 6.  2.  8.  3.  2.  4.  6.  3.]
-                           [ 2.  1.  3.  5.  4.  4.  3.  0.]
-                           [ 8.  3.  0.  2.  6.  3.  6.  4.]
-                           [ 3.  5.  2.  6.  3.  0.  4.  7.]
-                           [ 6.  7.  5.  7.  1.  2.  1.  3.]
-                           [ 7.  1.  7.  9.  2.  1.  3.  5.]
-                           [ 5.  7.  2.  4.  1.  3.  9.  0.]
-                           [ 7.  9.  4.  8.  3.  5.  0.  8.]]
-
-            output.dims = {8, 8}
-
-            output.lod = [[4, 4]]
-
-    Examples:
-
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            import paddle
-            paddle.enable_static()
-            data = fluid.data(name='data', shape=[None, 3, 32, 32],
-                                     dtype='float32')
-            output = fluid.layers.im2sequence(
-                input=data, stride=[1, 1], filter_size=[2, 2])
-
-
-    """
-    assert (
-        not _non_static_mode()
-    ), "sequence layer is not supported in dygraph mode yet."
-
-    check_variable_and_dtype(input, 'input', ['float32'], 'im2sequence')
-
-    if isinstance(filter_size, int):
-        filter_size = [filter_size, filter_size]
-    if isinstance(stride, int):
-        stride = [stride, stride]
-    if isinstance(padding, int):
-        padding = [padding, padding]
-    if len(padding) == 2:
-        padding.append(padding[0])
-        padding.append(padding[1])
-    inputs = {"X": input}
-    attrs = {"kernels": filter_size, "strides": stride, "paddings": padding}
-    if input_image_size:
-        if isinstance(out_stride, int):
-            out_stride = [out_stride, out_stride]
-        inputs["Y"] = input_image_size
-        attrs["out_stride"] = out_stride
-    helper = LayerHelper('im2sequence', **locals())
-    out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
-    helper.append_op(
-        type='im2sequence', inputs=inputs, outputs={'Out': out}, attrs=attrs
-    )
-    return out
-
-
 @templatedoc()
 def row_conv(input, future_context_size, param_attr=None, act=None):
     """
@@ -3214,165 +2794,6 @@ def row_conv(input, future_context_size, param_attr=None, act=None):
     return helper.append_activation(out)
 
 
-@templatedoc()
-def multiplex(inputs, index, name=None):
-    """
-
-    Based on the given index parameter, the OP selects a specific row from each input Tensor to construct the output Tensor.
-
-    If the input of this OP contains :math:`m` Tensors, where :math:`I_{i}` means the i-th input Tensor, :math:`i` between :math:`[0,m)` .
-
-    And :math:`O` means the output, where :math:`O[i]` means the i-th row of the output, then the output satisfies that :math:`O[i] = I_{index[i]}[i]` .
-
-    For Example:
-
-            .. code-block:: text
-
-                Given:
-
-                inputs = [[[0,0,3,4], [0,1,3,4], [0,2,4,4], [0,3,3,4]],
-                          [[1,0,3,4], [1,1,7,8], [1,2,4,2], [1,3,3,4]],
-                          [[2,0,3,4], [2,1,7,8], [2,2,4,2], [2,3,3,4]],
-                          [[3,0,3,4], [3,1,7,8], [3,2,4,2], [3,3,3,4]]]
-
-                index = [[3],[0],[1],[2]]
-
-                out = [[3,0,3,4],    # out[0] = inputs[index[0]][0] = inputs[3][0] = [3,0,3,4]
-                       [0,1,3,4],    # out[1] = inputs[index[1]][1] = inputs[0][1] = [0,1,3,4]
-                       [1,2,4,2],    # out[2] = inputs[index[2]][2] = inputs[1][2] = [1,2,4,2]
-                       [2,3,3,4]]    # out[3] = inputs[index[3]][3] = inputs[2][3] = [2,3,3,4]
-
-
-    Args:
-        inputs (list): The input Tensor list. The list elements are N-D Tensors of data types float32, float64, int32, int64. All input Tensor shapes should be the same and rank must be at least 2.
-        index (Tensor): Used to select some rows in the input Tensor to construct an index of the output Tensor. It is a 2-D Tensor with data type int32 or int64 and shape [M, 1], where M is the number of input Tensors.
-        name(str, optional): The default value is None. Normally there is no
-            need for user to set this property. For more information, please
-            refer to :ref:`api_guide_Name`.
-    Returns:
-        Tensor: Output of multiplex OP, with data type being float32, float64, int32, int64.
-
-    Examples:
-
-        .. code-block:: python
-
-            import paddle
-            import numpy as np
-            img1 = np.array([[1, 2], [3, 4]]).astype(np.float32)
-            img2 = np.array([[5, 6], [7, 8]]).astype(np.float32)
-            inputs = [paddle.to_tensor(img1), paddle.to_tensor(img2)]
-            index = paddle.to_tensor(np.array([[1], [0]]).astype(np.int32))
-            res = paddle.multiplex(inputs, index)
-            print(res) # [array([[5., 6.], [3., 4.]], dtype=float32)]
-
-    """
-
-    if _in_legacy_dygraph():
-        return _legacy_C_ops.multiplex(index, inputs)
-    if in_dygraph_mode():
-        return _C_ops.multiplex(inputs, index)
-    helper = LayerHelper('multiplex', **locals())
-
-    check_type(inputs, 'inputs', (list), 'multiplex')
-    if len(inputs) < 2:
-        raise ValueError(
-            "inputs should be a list object with at least 2 elements."
-        )
-    for id, x in enumerate(inputs):
-        check_variable_and_dtype(
-            x,
-            'input[' + str(id) + ']',
-            ['float32', 'float64', 'int32', 'int64'],
-            'multiplex',
-        )
-    check_variable_and_dtype(index, "index", ['int32', 'int64'], 'multiplex')
-
-    out = helper.create_variable_for_type_inference(inputs[0].dtype)
-    helper.append_op(
-        type='multiplex',
-        inputs={'X': inputs, 'Ids': index},
-        outputs={'Out': [out]},
-    )
-    return out
-
-
-def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None):
-    """
-
-    This layer computes the smooth L1 loss for Variable :attr:`x` and :attr:`y`.
-    It takes the first dimension of :attr:`x` and :attr:`y` as batch size.
-    For each instance, it computes the smooth L1 loss element by element first
-    and then sums all the losses. So the shape of output Variable is
-    [batch_size, 1].
-
-    Args:
-        x (Variable): A tensor with rank at least 2. The input value of smooth
-            L1 loss op with shape [batch_size, dim1, ..., dimN].
-            A LoDTensor or Tensor with type float32.
-        y (Variable): A tensor with rank at least 2. The target value of smooth
-            L1 loss op with same shape as :attr:`x`.
-            A LoDTensor or Tensor with type float32.
-        inside_weight (Variable|None):  A tensor with rank at least 2. This
-            input is optional and should have same shape with :attr:`x`. If
-            provided, the result of (:attr:`x` - :attr:`y`) will be multiplied
-            by this tensor element by element.
-            A Tensor with type float32.
-        outside_weight (Variable|None): A tensor with rank at least 2. This
-            input is optional and should have same shape with :attr:`x`. If
-            provided, the out smooth L1 loss will be multiplied by this tensor
-            element by element.
-            A Tensor with type float32.
-        sigma (float|None): Hyper parameter of smooth L1 loss layer. A float
-           scalar with default value 1.0.
-
-    Returns:
-        Variable: The output smooth L1 loss with shape [batch_size, 1].  A Tensor with type float32.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            import numpy as np
-            import paddle
-            paddle.enable_static()
-            data = fluid.data(name="x", shape=[-1, 3], dtype="float32")
-            label = fluid.data(name="y", shape=[-1, 3], dtype="float32")
-            result = fluid.layers.smooth_l1(data,label)
-            place = fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            exe.run(fluid.default_startup_program())
-            x = np.random.rand(3,3).astype("float32")
-            y = np.random.rand(3,3).astype("float32")
-            output= exe.run(feed={"x":x, "y":y},
-                             fetch_list=[result])
-            print(output)
-
-            #[array([[0.08220536],
-            #       [0.36652038],
-            #      [0.20541131]], dtype=float32)]
-
-    """
-    check_variable_and_dtype(x, 'X', ['float32', 'float64'], 'smooth_l1_loss')
-    check_variable_and_dtype(y, 'Y', ['float32', 'float64'], 'smooth_l1_loss')
-
-    helper = LayerHelper('smooth_l1_loss', **locals())
-
-    diff = helper.create_variable_for_type_inference(dtype=x.dtype)
-    loss = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type='smooth_l1_loss',
-        inputs={
-            'X': x,
-            'Y': y,
-            'InsideWeight': inside_weight,
-            'OutsideWeight': outside_weight,
-        },
-        outputs={'Diff': diff, 'Out': loss},
-        attrs={'sigma': sigma if sigma is not None else 1.0},
-    )
-    return loss
-
-
 @deprecated(since='2.0.0', update_to='paddle.nn.functional.one_hot')
 def one_hot(input, depth, allow_out_of_range=False):
     """
diff --git a/python/paddle/fluid/tests/unittests/dist_transformer.py b/python/paddle/fluid/tests/unittests/dist_transformer.py
index cb60e1c599114..c6165dd753537 100644
--- a/python/paddle/fluid/tests/unittests/dist_transformer.py
+++ b/python/paddle/fluid/tests/unittests/dist_transformer.py
@@ -1833,8 +1833,8 @@ def beam_search():
             )
             logits = paddle.reshape(logits, (-1, trg_vocab_size))
 
-            topk_scores, topk_indices = layers.topk(
-                input=paddle.nn.functional.softmax(logits), k=beam_size
+            topk_scores, topk_indices = paddle.topk(
+                x=paddle.nn.functional.softmax(logits), k=beam_size
             )
             accu_scores = layers.elementwise_add(
                 x=paddle.log(topk_scores),
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py
index 5c04aecd1a87f..5babde40b4355 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py
@@ -459,9 +459,7 @@ def beam_search(self, inputs):
             scores = paddle.reshape(
                 log_probs, [-1, self.beam_size * self.tar_vocab_size]
             )
-            topk_scores, topk_indices = fluid.layers.topk(
-                input=scores, k=self.beam_size
-            )
+            topk_scores, topk_indices = paddle.topk(x=scores, k=self.beam_size)
 
             beam_indices = paddle.floor_divide(topk_indices, vocab_size_tensor)
             token_indices = paddle.remainder(topk_indices, vocab_size_tensor)
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py
index e6f03170b4734..16449d00ae736 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py
@@ -853,9 +853,7 @@ def gather(input, indices, batch_pos):
                 log_probs, [-1, beam_size * self.trg_vocab_size]
             )
             scores = log_probs
-            topk_scores, topk_indices = fluid.layers.topk(
-                input=scores, k=beam_size
-            )
+            topk_scores, topk_indices = paddle.topk(x=scores, k=beam_size)
             beam_indices = paddle.floor_divide(topk_indices, vocab_size_tensor)
             token_indices = paddle.remainder(topk_indices, vocab_size_tensor)
 
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_topk_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_topk_op_ipu.py
index dbcbe6e393725..6a302a7cb5a89 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_topk_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_topk_op_ipu.py
@@ -31,7 +31,7 @@ def setUp(self):
         self.set_op_attrs()
 
     def set_test_op(self):
-        self.op = paddle.fluid.layers.topk
+        self.op = paddle.topk
 
     def set_data_feed(self):
         data = np.random.uniform(size=[3, 5])
diff --git a/python/paddle/fluid/tests/unittests/npu/test_smooth_l1_loss_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_smooth_l1_loss_op_npu.py
index 54e625a9f4a91..6da0f7f36a133 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_smooth_l1_loss_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_smooth_l1_loss_op_npu.py
@@ -138,22 +138,5 @@ def test_check_grad_ingore_y(self):
         )
 
 
-class TestSmoothL1LossOpError(unittest.TestCase):
-    def test_errors(self):
-        with fluid.program_guard(fluid.Program(), fluid.Program()):
-            # The input type of accuracy_op must be Variable.
-            x1 = fluid.create_lod_tensor(
-                np.array([[-1]]), [[1]], fluid.NPUPlace(0)
-            )
-            y1 = fluid.create_lod_tensor(
-                np.array([[-1]]), [[1]], fluid.NPUPlace(0)
-            )
-            self.assertRaises(TypeError, fluid.layers.smooth_l1, x1, y1)
-            # The input dtype of accuracy_op must be float32 or float64.
-            x2 = fluid.layers.data(name='x2', shape=[4], dtype="int32")
-            y2 = fluid.layers.data(name='x2', shape=[4], dtype="int32")
-            self.assertRaises(TypeError, fluid.layers.smooth_l1, x2, y2)
-
-
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_beam_search_op.py b/python/paddle/fluid/tests/unittests/test_beam_search_op.py
index bc737a5ed55f4..d492560a50972 100644
--- a/python/paddle/fluid/tests/unittests/test_beam_search_op.py
+++ b/python/paddle/fluid/tests/unittests/test_beam_search_op.py
@@ -312,7 +312,7 @@ def test_errors(self):
                 name='pre_scores', shape=[1], lod_level=2, dtype='float32'
             )
             probs = fluid.data(name='probs', shape=[10000], dtype='float32')
-            topk_scores, topk_indices = fluid.layers.topk(probs, k=4)
+            topk_scores, topk_indices = paddle.topk(probs, k=4)
             accu_scores = fluid.layers.elementwise_add(
                 x=paddle.log(x=topk_scores),
                 y=paddle.reshape(pre_scores, shape=[-1]),
diff --git a/python/paddle/fluid/tests/unittests/test_ctc_align.py b/python/paddle/fluid/tests/unittests/test_ctc_align.py
index 18d8cb35e6ebe..9e76d29a775c8 100644
--- a/python/paddle/fluid/tests/unittests/test_ctc_align.py
+++ b/python/paddle/fluid/tests/unittests/test_ctc_align.py
@@ -18,7 +18,6 @@
 from op_test import OpTest
 
 import paddle
-import paddle.fluid as fluid
 
 
 def CTCAlign(input, lod, blank, merge_repeated, padding=0, input_length=None):
@@ -226,50 +225,6 @@ def config(self):
         )
 
 
-class TestCTCAlignOpApi(unittest.TestCase):
-    def test_api(self):
-        x = fluid.layers.data('x', shape=[4], dtype='float32')
-        y = fluid.layers.ctc_greedy_decoder(x, blank=0)
-
-        x_pad = fluid.layers.data('x_pad', shape=[4, 4], dtype='float32')
-        x_pad_len = fluid.layers.data('x_pad_len', shape=[1], dtype='int64')
-        y_pad, y_pad_len = fluid.layers.ctc_greedy_decoder(
-            x_pad, blank=0, input_length=x_pad_len
-        )
-
-        place = fluid.CPUPlace()
-        x_tensor = fluid.create_lod_tensor(
-            np.random.rand(8, 4).astype("float32"), [[4, 4]], place
-        )
-
-        x_pad_tensor = np.random.rand(2, 4, 4).astype("float32")
-        x_pad_len_tensor = np.array([[4], [4]]).reshape([2, 1]).astype("int64")
-
-        exe = fluid.Executor(place)
-
-        exe.run(fluid.default_startup_program())
-        ret = exe.run(
-            feed={
-                'x': x_tensor,
-                'x_pad': x_pad_tensor,
-                'x_pad_len': x_pad_len_tensor,
-            },
-            fetch_list=[y, y_pad, y_pad_len],
-            return_numpy=False,
-        )
-
-
-class BadInputTestCTCAlignr(unittest.TestCase):
-    def test_error(self):
-        with fluid.program_guard(fluid.Program()):
-
-            def test_bad_x():
-                x = fluid.layers.data(name='x', shape=[8], dtype='int64')
-                cost = fluid.layers.ctc_greedy_decoder(input=x, blank=0)
-
-            self.assertRaises(TypeError, test_bad_x)
-
-
 if __name__ == "__main__":
     paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index 64e671c8ee9bd..62def4247037f 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -1519,8 +1519,8 @@ def test_topk(self):
         with self.dynamic_graph():
             with _test_eager_guard():
                 input = fluid.dygraph.to_variable(np.random.random((13, 11)))
-                top5_values1, top5_indices1 = layers.topk(input, k=5)
-                top5_values2, top5_indices2 = layers.topk(
+                top5_values1, top5_indices1 = paddle.topk(input, k=5)
+                top5_values2, top5_indices2 = paddle.topk(
                     input, k=fluid.dygraph.to_variable(np.array([5]))
                 )
                 np.testing.assert_array_equal(
@@ -1531,8 +1531,8 @@ def test_topk(self):
                 )
 
             input = fluid.dygraph.to_variable(np.random.random((13, 11)))
-            top5_values1, top5_indices1 = layers.topk(input, k=5)
-            top5_values2, top5_indices2 = layers.topk(
+            top5_values1, top5_indices1 = paddle.topk(input, k=5)
+            top5_values2, top5_indices2 = paddle.topk(
                 input, k=fluid.dygraph.to_variable(np.array([5]))
             )
             np.testing.assert_array_equal(
@@ -3104,7 +3104,7 @@ def make_multiplex(self):
             x1 = self._get_data(name='x1', shape=[4], dtype='float32')
             x2 = self._get_data(name='x2', shape=[4], dtype='float32')
             index = self._get_data(name='index', shape=[1], dtype='int32')
-            out = layers.multiplex(inputs=[x1, x2], index=index)
+            out = paddle.multiplex(inputs=[x1, x2], index=index)
             return out
 
     def make_softmax_with_cross_entropy(self):
@@ -3144,15 +3144,6 @@ def make_softmax_with_cross_entropy(self):
             self.assertIsNotNone(loss4)
             return loss4
 
-    def make_smooth_l1(self):
-        with program_guard(
-            fluid.default_main_program(), fluid.default_startup_program()
-        ):
-            x = self._get_data(name='x', shape=[4], dtype='float32')
-            y = self._get_data(name='label', shape=[4], dtype='float32')
-            loss = layers.smooth_l1(x, y)
-            return loss
-
     def make_scatter(self):
         with program_guard(
             fluid.default_main_program(), fluid.default_startup_program()
@@ -3192,7 +3183,7 @@ def make_topk(self):
             fluid.default_main_program(), fluid.default_startup_program()
         ):
             data = self._get_data(name="label", shape=[200], dtype="float32")
-            values, indices = layers.topk(data, k=5)
+            values, indices = paddle.topk(data, k=5)
             return values
             return indices
 
@@ -3559,20 +3550,6 @@ def test_dynamic_lstmp(self):
                 )
             )
 
-    def test_im2sequence(self):
-        # TODO(minqiyang): dygraph do not support lod now
-        with self.static_graph():
-            x = layers.data(name='x', shape=[3, 128, 128], dtype='float32')
-            y = layers.data(name='y', shape=[], dtype='float32')
-            output = layers.im2sequence(
-                input=x,
-                input_image_size=y,
-                stride=[1, 1],
-                filter_size=[2, 2],
-                out_stride=[1, 1],
-            )
-            return output
-
     def test_lod_reset(self):
         # TODO(minqiyang): dygraph do not support lod now
         with self.static_graph():
diff --git a/python/paddle/fluid/tests/unittests/test_smooth_l1_loss_op.py b/python/paddle/fluid/tests/unittests/test_smooth_l1_loss_op.py
index ba251c05ac69e..6d259617c0248 100644
--- a/python/paddle/fluid/tests/unittests/test_smooth_l1_loss_op.py
+++ b/python/paddle/fluid/tests/unittests/test_smooth_l1_loss_op.py
@@ -17,8 +17,6 @@
 import numpy as np
 from op_test import OpTest
 
-import paddle.fluid as fluid
-
 
 def smooth_l1_loss_forward(val, sigma2):
     abs_val = abs(val)
@@ -124,22 +122,5 @@ def test_check_grad_ingore_y(self):
         )
 
 
-class TestSmoothL1LossOpError(unittest.TestCase):
-    def test_errors(self):
-        with fluid.program_guard(fluid.Program(), fluid.Program()):
-            # The input type of accuracy_op must be Variable.
-            x1 = fluid.create_lod_tensor(
-                np.array([[-1]]), [[1]], fluid.CPUPlace()
-            )
-            y1 = fluid.create_lod_tensor(
-                np.array([[-1]]), [[1]], fluid.CPUPlace()
-            )
-            self.assertRaises(TypeError, fluid.layers.smooth_l1, x1, y1)
-            # The input dtype of accuracy_op must be float32 or float64.
-            x2 = fluid.layers.data(name='x2', shape=[4], dtype="int32")
-            y2 = fluid.layers.data(name='x2', shape=[4], dtype="int32")
-            self.assertRaises(TypeError, fluid.layers.smooth_l1, x2, y2)
-
-
 if __name__ == '__main__':
     unittest.main()

From 2cb07a1ff21ff4a5d7f01e90520aeba7974a8def Mon Sep 17 00:00:00 2001
From: heyanru <81976792+heyanru01@users.noreply.github.com>
Date: Mon, 5 Dec 2022 14:02:30 +0800
Subject: [PATCH 08/13] [Fluid Clean] remove
 fluid.layers.continuous_value_model (#48509)

---
 python/paddle/fluid/layers/nn.py    | 58 -----------------------------
 python/paddle/static/nn/__init__.py |  1 -
 2 files changed, 59 deletions(-)

diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index a4125088c8a2f..56765c19cba65 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -102,7 +102,6 @@
     'bilinear_tensor_product',
     'merge_selected_rows',
     'get_tensor_from_selected_rows',
-    'continuous_value_model',
     'unfold',
     'deformable_roi_pooling',
     'shard_index',
@@ -5563,63 +5562,6 @@ def get_tensor_from_selected_rows(x, name=None):
     return out
 
 
-def continuous_value_model(input, cvm, use_cvm=True):
-    r"""
-
-    **continuous_value_model layers**
-
-    Now, this OP is used in CTR project to remove or dispose show and click value in :attr:`input`.
-
-    :attr:`input` is an embedding vector including show and click value, whose shape is :math:`[N, D]` (N is batch size. D is `2 + embedding dim` ).
-    Show and click at first two dims of embedding vector D.
-    If :attr:`use_cvm` is True, it will calculate :math:`log(show)` and :math:`log(click)` , and output shape is :math:`[N, D]` .
-    If :attr:`use_cvm` is False, it will remove show and click from :attr:`input` , and output shape is :math:`[N, D - 2]` .
-    :attr:`cvm` is show_click info, whose shape is :math:`[N, 2]` .
-
-    Args:
-        input (Variable): The input variable. A 2-D LoDTensor with shape :math:`[N, D]` , where N is the batch size, D is `2 + the embedding dim` . `lod level = 1` .
-        A Tensor with type float32, float64.
-        cvm (Variable): Show and click variable. A 2-D Tensor with shape :math:`[N, 2]` , where N is the batch size, 2 is show and click.
-        A Tensor with type float32, float64.
-        use_cvm  (bool):  Use show_click or not. if use, the output dim is the same as input.
-                          if not use, the output dim is `input dim - 2` (remove show and click)
-
-    Returns:
-
-        Variable: A 2-D LodTensor with shape :math:`[N, M]` . if :attr:`use_cvm` = True, M is equal to input dim D. if False, M is equal to `D - 2`. \
-        A Tensor with same type as input.
-
-    Examples:
-
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          input = fluid.data(name="input", shape=[64, 1], dtype="int64")
-          label = fluid.data(name="label", shape=[64, 1], dtype="int64")
-          embed = fluid.layers.embedding(
-                            input=input,
-                            size=[100, 11],
-                            dtype='float32')
-          ones = fluid.layers.fill_constant_batch_size_like(input=label, shape=[-1, 1], dtype="int64", value=1)
-          show_clk = fluid.layers.cast(fluid.layers.concat([ones, label], axis=1), dtype='float32')
-          show_clk.stop_gradient = True
-          input_with_cvm = fluid.layers.continuous_value_model(embed, show_clk, True)
-
-    """
-    helper = LayerHelper('cvm', **locals())
-    out = helper.create_variable(dtype=input.dtype)
-    check_variable_and_dtype(
-        input, 'input', ['float16', 'float32', 'float64'], 'cvm'
-    )
-    helper.append_op(
-        type='cvm',
-        inputs={'X': [input], 'CVM': [cvm]},
-        outputs={'Y': [out]},
-        attrs={"use_cvm": use_cvm},
-    )
-    return out
-
-
 def unfold(x, kernel_sizes, strides=1, paddings=0, dilations=1, name=None):
     r"""
 
diff --git a/python/paddle/static/nn/__init__.py b/python/paddle/static/nn/__init__.py
index 3d3cc5f8a2bb8..1849cfd395a55 100755
--- a/python/paddle/static/nn/__init__.py
+++ b/python/paddle/static/nn/__init__.py
@@ -40,7 +40,6 @@
 
 from ...fluid.input import embedding  # noqa: F401
 from ...fluid.contrib.layers import sparse_embedding  # noqa: F401
-from ...fluid.layers import continuous_value_model  # noqa: F401
 from ...fluid.layers import StaticRNN  # noqa: F401
 
 from ...fluid.layers.sequence_lod import sequence_conv  # noqa: F401

From 0c1d68e1a55b2f206172ebe9483dfe75436d4ca9 Mon Sep 17 00:00:00 2001
From: HongyuJia <jiahongyu@baidu.com>
Date: Mon, 5 Dec 2022 14:09:55 +0800
Subject: [PATCH 09/13] fix custom operator backward=None (#48656)

---
 .../custom_operator/custom_operator_node.cc   | 22 ++++++++++---------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/paddle/fluid/eager/custom_operator/custom_operator_node.cc b/paddle/fluid/eager/custom_operator/custom_operator_node.cc
index 1f0a055cbd386..5c3c2fbe7e9c6 100644
--- a/paddle/fluid/eager/custom_operator/custom_operator_node.cc
+++ b/paddle/fluid/eager/custom_operator/custom_operator_node.cc
@@ -217,18 +217,20 @@ RunCustomOpNode::operator()(
   VLOG(6) << "Prepare Grad outputs for size: " << grad_outputs_names.size();
   for (size_t i = 0; i < OutputMeta().size(); i++) {
     if (map[0][0].find(i) != map[0][0].end()) {
+      int grad_output_idx = map[0][0][i];
       VLOG(7) << "Insert grad outputs: " << i
-              << " with size: " << OutputMeta()[i].size()
-              << " to tmp_outputs: " << map[0][0][i];
-      for (size_t j = 0; j < OutputMeta()[i].size(); j++) {
-        outs[i].emplace_back(/* init it incase of copy nullptr of shared_ptr */
-                             std::make_shared<phi::DenseTensor>(
-                                 phi::DataType::UNDEFINED),
-                             egr::Controller::Instance().GenerateUniqueName(
-                                 "custom_tmp_grad"));
-        egr::EagerUtils::autograd_meta(&(outs[i][j]));
+              << " with size: " << OutputMeta()[grad_output_idx].size()
+              << " to tmp_outputs: " << grad_output_idx;
+      for (size_t j = 0; j < OutputMeta()[grad_output_idx].size(); j++) {
+        outs[grad_output_idx]
+            .emplace_back(/* init it incase of copy nullptr of shared_ptr */
+                          std::make_shared<phi::DenseTensor>(
+                              phi::DataType::UNDEFINED),
+                          egr::Controller::Instance().GenerateUniqueName(
+                              "custom_tmp_grad"));
+        egr::EagerUtils::autograd_meta(&(outs[grad_output_idx][j]));
       }
-      tmp_outs[map[0][0][i]] = outs[i];
+      tmp_outs[grad_output_idx] = outs[grad_output_idx];
     }
   }
   for (size_t i = 0; i < tmp_outs.size(); i++) {

From 9913da022d091fb652bf2c472c8209a70cc3e947 Mon Sep 17 00:00:00 2001
From: risemeup1 <62429225+risemeup1@users.noreply.github.com>
Date: Mon, 5 Dec 2022 14:21:47 +0800
Subject: [PATCH 10/13] Setuptools (#48301)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* test

* test

* test

* test

* test

* support setuptools for paddle

* modify paddle_build.sh

* modify paddle_build.sh

* modify paddle_build.sh

* modify paddle_build.sh

* modify paddle_build.sh

* test

* modify setup.py

* modify build_options

* modify build_options

* modify paddle_build.sh

* modify setup.py

* modify paddle_build.sh

* modify setup.py

* modify setup.py

* modify setup.py

* modify setup.py

* modify paddle_build.sh

* debug

* debug

* debug

* dddd

* debug

* debug

* debug

* debug

* debug

* debug

* debug

* debug

* debug

* fix bug that no version.py

* debug

* debug

* debug

* debug

* debug

* debug

* Delete .pre-commit-config.yaml

* debug

* support ninja

* support ninja

* debug

* debug

* debug

* support setuptools for paddle

* modify code style

* debug

* debug

* debug

* debug

* 取消make clean

* 取消make clean

* debug

* debug

* debug

* debug for py3

* debug

* debug

* debug

* 将mkdir_and_copy_file单独封装一个函数

* modify paddle_build.sh

* modify setup.py after zhangbo reviewed
---
 .pre-commit-config.yaml        |    0
 paddle/scripts/paddle_build.sh |  227 +++++-
 python/CMakeLists.txt          |   75 +-
 python/env_dict.py.in          |   69 ++
 setup.py                       | 1380 ++++++++++++++++++++++++++++++++
 5 files changed, 1725 insertions(+), 26 deletions(-)
 mode change 100755 => 100644 .pre-commit-config.yaml
 create mode 100644 python/env_dict.py.in
 create mode 100644 setup.py

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
old mode 100755
new mode 100644
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 1153f636136d1..4e563496d3529 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -768,7 +768,9 @@ function run_linux_cpu_test() {
     mkdir -p ${PADDLE_ROOT}/build
     cd ${PADDLE_ROOT}/build
     pip install hypothesis
-    pip install ${PADDLE_ROOT}/build/python/dist/*whl
+    if [ -d "${PADDLE_ROOT}/build/python/dist/" ]; then
+        pip install ${PADDLE_ROOT}/build/python/dist/*whl
+    fi
     cp ${PADDLE_ROOT}/build/python/paddle/fluid/tests/unittests/op_test.py ${PADDLE_ROOT}/build/python
     cp ${PADDLE_ROOT}/build/python/paddle/fluid/tests/unittests/testsuite.py ${PADDLE_ROOT}/build/python
     cp -r ${PADDLE_ROOT}/build/python/paddle/fluid/tests/unittests/white_list ${PADDLE_ROOT}/build/python
@@ -917,6 +919,7 @@ set +x
 set -ex
     fi
 }
+
 function get_precision_ut_mac() {
     on_precision=0
     UT_list=$(ctest -N | awk -F ': ' '{print $2}' | sed '/^$/d' | sed '$d')
@@ -1050,7 +1053,9 @@ function generate_api_spec() {
     else
         pip install -r ${PADDLE_ROOT}/python/requirements.txt
     fi
-    pip --no-cache-dir install ${PADDLE_ROOT}/build/python/dist/*whl
+    if [ -d "${PADDLE_ROOT}/build/python/dist/" ]; then
+        pip --no-cache-dir install ${PADDLE_ROOT}/build/python/dist/*whl
+    fi
     spec_path=${PADDLE_ROOT}/paddle/fluid/API_${spec_kind}.spec
     python ${PADDLE_ROOT}/tools/print_signatures.py paddle > $spec_path
 
@@ -2863,7 +2868,9 @@ function parallel_test() {
     mkdir -p ${PADDLE_ROOT}/build
     cd ${PADDLE_ROOT}/build
     pip install hypothesis
-    pip install ${PADDLE_ROOT}/build/python/dist/*whl
+    if [ -d "${PADDLE_ROOT}/build/python/dist/" ]; then
+        pip install ${PADDLE_ROOT}/build/python/dist/*whl
+    fi
     cp ${PADDLE_ROOT}/build/python/paddle/fluid/tests/unittests/testsuite.py ${PADDLE_ROOT}/build/python
     cp -r ${PADDLE_ROOT}/build/python/paddle/fluid/tests/unittests/white_list ${PADDLE_ROOT}/build/python
     ut_total_startTime_s=`date +%s`
@@ -3454,7 +3461,219 @@ function check_coverage_build() {
     fi
     set -x
 }
+function run_setup(){  # $1: Python ABI tag (e.g. cp37-cp37m, may be empty); builds and installs via "python setup.py install"
+    rm -rf ${PADDLE_ROOT}/build
+    startTime_s=`date +%s`
 
+    SYSTEM=`uname -s`
+    if [ "$SYSTEM" == "Darwin" ]; then
+        echo "Using python abi: $1"
+        if [ "$1" == "cp36-cp36m" ] || [ "$1" == "" ]; then
+            if [ -d "/Library/Frameworks/Python.framework/Versions/3.6" ]; then
+                export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.6/lib/
+                export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:/Library/Frameworks/Python.framework/Versions/3.6/lib/
+                export PATH=/Library/Frameworks/Python.framework/Versions/3.6/bin/:${PATH}
+                # After changing "PYTHON_LIBRARY:FILEPATH" to "PYTHON_LIBRARY", we can use export.
+                export PYTHON_EXECUTABLE=/Library/Frameworks/Python.framework/Versions/3.6/bin/python3
+                export PYTHON_INCLUDE_DIR=/Library/Frameworks/Python.framework/Versions/3.6/include/python3.6m/
+                export PYTHON_LIBRARY=/Library/Frameworks/Python.framework/Versions/3.6/lib/libpython3.6m.dylib
+                pip3.6 install --user -r ${PADDLE_ROOT}/python/requirements.txt
+            else
+                exit 1
+            fi
+        elif [ "$1" == "cp37-cp37m" ]; then
+            if [ -d "/Library/Frameworks/Python.framework/Versions/3.7" ]; then
+                export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.7/lib/
+                export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:/Library/Frameworks/Python.framework/Versions/3.7/lib/
+                export PATH=/Library/Frameworks/Python.framework/Versions/3.7/bin/:${PATH}
+                # After changing "PYTHON_LIBRARY:FILEPATH" to "PYTHON_LIBRARY", we can use export.
+                export PYTHON_EXECUTABLE=/Library/Frameworks/Python.framework/Versions/3.7/bin/python3
+                export PYTHON_INCLUDE_DIR=/Library/Frameworks/Python.framework/Versions/3.7/include/python3.7m/
+                export PYTHON_LIBRARY=/Library/Frameworks/Python.framework/Versions/3.7/lib/libpython3.7m.dylib
+
+                pip3.7 install --user -r ${PADDLE_ROOT}/python/requirements.txt
+            else
+                exit 1
+            fi
+        elif [ "$1" == "cp38-cp38" ]; then
+            if [ -d "/Library/Frameworks/Python.framework/Versions/3.8" ]; then
+                export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.8/lib/
+                export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:/Library/Frameworks/Python.framework/Versions/3.8/lib/
+                export PATH=/Library/Frameworks/Python.framework/Versions/3.8/bin/:${PATH}
+                # After changing "PYTHON_LIBRARY:FILEPATH" to "PYTHON_LIBRARY", we can use export.
+                export PYTHON_EXECUTABLE=/Library/Frameworks/Python.framework/Versions/3.8/bin/python3
+                export PYTHON_INCLUDE_DIR=/Library/Frameworks/Python.framework/Versions/3.8/include/python3.8/
+                export PYTHON_LIBRARY=/Library/Frameworks/Python.framework/Versions/3.8/lib/libpython3.8.dylib
+                pip3.8 install --user -r ${PADDLE_ROOT}/python/requirements.txt
+            else
+                exit 1
+            fi
+        elif [ "$1" == "cp39-cp39" ]; then
+            if [ -d "/Library/Frameworks/Python.framework/Versions/3.9" ]; then
+                export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.9/lib/
+                export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:/Library/Frameworks/Python.framework/Versions/3.9/lib/
+                export PATH=/Library/Frameworks/Python.framework/Versions/3.9/bin/:${PATH}
+                # After changing "PYTHON_LIBRARY:FILEPATH" to "PYTHON_LIBRARY", we can use export.
+                export PYTHON_EXECUTABLE=/Library/Frameworks/Python.framework/Versions/3.9/bin/python3
+                export PYTHON_INCLUDE_DIR=/Library/Frameworks/Python.framework/Versions/3.9/include/python3.9/
+                export PYTHON_LIBRARY=/Library/Frameworks/Python.framework/Versions/3.9/lib/libpython3.9.dylib
+                pip3.9 install --user -r ${PADDLE_ROOT}/python/requirements.txt
+            else
+                exit 1
+            fi
+        elif [ "$1" == "cp310-cp310" ]; then
+            if [ -d "/Library/Frameworks/Python.framework/Versions/3.10" ]; then
+                export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.10/lib/
+                export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:/Library/Frameworks/Python.framework/Versions/3.10/lib/
+                export PATH=/Library/Frameworks/Python.framework/Versions/3.10/bin/:${PATH}
+                # After changing "PYTHON_LIBRARY:FILEPATH" to "PYTHON_LIBRARY", we can use export.
+                export PYTHON_EXECUTABLE=/Library/Frameworks/Python.framework/Versions/3.10/bin/python3
+                export PYTHON_INCLUDE_DIR=/Library/Frameworks/Python.framework/Versions/3.10/include/python3.10/
+                export PYTHON_LIBRARY=/Library/Frameworks/Python.framework/Versions/3.10/lib/libpython3.10.dylib
+                pip3.10 install --user -r ${PADDLE_ROOT}/python/requirements.txt
+            else
+                exit 1
+            fi
+        fi
+    else
+        if [ "$1" != "" ]; then
+            echo "using python abi: $1"
+            if [ "$1" == "cp36-cp36m" ]; then
+                export LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH}
+                export PATH=/opt/_internal/cpython-3.6.0/bin/:${PATH}
+                # After changing "PYTHON_LIBRARY:FILEPATH" to "PYTHON_LIBRARY", we can use export.
+                export PYTHON_EXECUTABLE=/opt/_internal/cpython-3.6.0/bin/python3
+                export PYTHON_INCLUDE_DIR=/opt/_internal/cpython-3.6.0/include/python3.6m
+                export PYTHON_LIBRARIES=/opt/_internal/cpython-3.6.0/lib/libpython3.so
+                pip3.6 install -r ${PADDLE_ROOT}/python/requirements.txt
+            elif [ "$1" == "cp37-cp37m" ]; then
+                export LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH}
+                export PATH=/opt/_internal/cpython-3.7.0/bin/:${PATH}
+                # After changing "PYTHON_LIBRARY:FILEPATH" to "PYTHON_LIBRARY", we can use export.
+                export PYTHON_EXECUTABLE=/opt/_internal/cpython-3.7.0/bin/python3.7
+                export PYTHON_INCLUDE_DIR=/opt/_internal/cpython-3.7.0/include/python3.7m
+                export PYTHON_LIBRARIES=/opt/_internal/cpython-3.7.0/lib/libpython3.so
+                pip3.7 install -r ${PADDLE_ROOT}/python/requirements.txt
+            elif [ "$1" == "cp38-cp38" ]; then
+                export LD_LIBRARY_PATH=/opt/_internal/cpython-3.8.0/lib/:${LD_LIBRARY_PATH}
+                export PATH=/opt/_internal/cpython-3.8.0/bin/:${PATH}
+                # After changing "PYTHON_LIBRARY:FILEPATH" to "PYTHON_LIBRARY", we can use export.
+                export PYTHON_EXECUTABLE=/opt/_internal/cpython-3.8.0/bin/python3.8
+                export PYTHON_INCLUDE_DIR=/opt/_internal/cpython-3.8.0/include/python3.8
+                export PYTHON_LIBRARIES=/opt/_internal/cpython-3.8.0/lib/libpython3.so
+                pip3.8 install -r ${PADDLE_ROOT}/python/requirements.txt
+            elif [ "$1" == "cp39-cp39" ]; then
+                export LD_LIBRARY_PATH=/opt/_internal/cpython-3.9.0/lib/:${LD_LIBRARY_PATH}
+                export PATH=/opt/_internal/cpython-3.9.0/bin/:${PATH}
+                # After changing "PYTHON_LIBRARY:FILEPATH" to "PYTHON_LIBRARY", we can use export.
+                export PYTHON_EXECUTABLE=/opt/_internal/cpython-3.9.0/bin/python3.9
+                export PYTHON_INCLUDE_DIR=/opt/_internal/cpython-3.9.0/include/python3.9
+                export PYTHON_LIBRARIES=/opt/_internal/cpython-3.9.0/lib/libpython3.so
+                pip3.9 install -r ${PADDLE_ROOT}/python/requirements.txt
+            elif [ "$1" == "cp310-cp310" ]; then
+                export LD_LIBRARY_PATH=/opt/_internal/cpython-3.10.0/lib/:${LD_LIBRARY_PATH}
+                export PATH=/opt/_internal/cpython-3.10.0/bin/:${PATH}
+                # After changing "PYTHON_LIBRARY:FILEPATH" to "PYTHON_LIBRARY", we can use export.
+                export PYTHON_EXECUTABLE=/opt/_internal/cpython-3.10.0/bin/python3.10
+                export PYTHON_INCLUDE_DIR=/opt/_internal/cpython-3.10.0/include/python3.10
+                export PYTHON_LIBRARIES=/opt/_internal/cpython-3.10.0/lib/libpython3.so
+                pip3.10 install -r ${PADDLE_ROOT}/python/requirements.txt
+            elif [ "$1" == "conda-python3.7" ]; then
+                export LD_LIBRARY_PATH=/opt/conda/lib/:${LD_LIBRARY_PATH}
+                export PATH=/opt/conda/bin/:${PATH}
+                # After changing "PYTHON_LIBRARY:FILEPATH" to "PYTHON_LIBRARY", we can use export.
+                export PYTHON_EXECUTABLE=/opt/conda/bin/python
+                export PYTHON_INCLUDE_DIR=/opt/conda/include/python3.7m
+                export PYTHON_LIBRARIES=/opt/conda/lib/libpython3.so
+                /opt/conda/bin/pip install -r ${PADDLE_ROOT}/python/requirements.txt
+            fi
+        else
+            pip install -r ${PADDLE_ROOT}/python/requirements.txt
+        fi
+    fi
+    # Platform-dependent defaults for the options exported further down.
+    if [ "$SYSTEM" == "Darwin" ]; then
+        WITH_DISTRIBUTE="OFF"
+        WITH_AVX=${WITH_AVX:-ON}
+        WITH_ARM=${WITH_ARM:-OFF}
+        INFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR:-~/.cache/inference_demo}
+    else
+        INFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR:-/root/.cache/inference_demo}
+    fi
+
+    distibuted_flag=${WITH_DISTRIBUTE:-OFF}
+    gloo_flag=${distibuted_flag}
+
+    if [ "$CMD" != "assert_file_approvals" ];then
+      which python
+      python -V
+      python -m pip install distro
+      python ${PADDLE_ROOT}/tools/summary_env.py
+      bash ${PADDLE_ROOT}/tools/get_cpu_info.sh
+    fi
+    export CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release}
+    export WITH_GPU=${WITH_GPU:-OFF}
+    export WITH_TENSORRT=${WITH_TENSORRT:-ON}
+    export WITH_ROCM=${WITH_ROCM:-OFF}
+    export WITH_CINN=${WITH_CINN:-OFF}
+    export WITH_DISTRIBUTE=${distibuted_flag}
+    export WITH_MKL=${WITH_MKL:-ON}
+    export WITH_AVX=${WITH_AVX:-OFF}
+    export CUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All}
+    export NEW_RELEASE_PYPI=${NEW_RELEASE_PYPI:-OFF}
+    export NEW_RELEASE_ALL=${NEW_RELEASE_ALL:-OFF}
+    export NEW_RELEASE_JIT=${NEW_RELEASE_JIT:-OFF}
+    export WITH_PYTHON=${WITH_PYTHON:-ON}
+    export CUDNN_ROOT=/usr/
+    export WITH_TESTING=${WITH_TESTING:-ON}
+    export WITH_COVERAGE=${WITH_COVERAGE:-OFF}
+    export WITH_INCREMENTAL_COVERAGE=${WITH_INCREMENTAL_COVERAGE:-OFF}
+    export CMAKE_MODULE_PATH=/opt/rocm/hip/cmake
+    export CMAKE_EXPORT_COMPILE_COMMANDS=ON
+    export WITH_CONTRIB=${WITH_CONTRIB:-ON}
+    export WITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON}
+    export WITH_INFRT=${WITH_INFRT:-OFF}
+    export INFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR}
+    export PY_VERSION=${PY_VERSION:-3.7}
+    export CMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build}
+    export WITH_PSCORE=${distibuted_flag}
+    export WITH_PSLIB=${WITH_PSLIB:-OFF}
+    export WITH_GLOO=${gloo_flag}
+    export LITE_GIT_TAG=release/v2.10
+    export WITH_XPU=${WITH_XPU:-OFF}
+    export WITH_MLU=${WITH_MLU:-OFF}
+    export WITH_IPU=${WITH_IPU:-OFF}
+    export WITH_CNCL=${WITH_CNCL:-OFF}
+    export XPU_SDK_ROOT=${XPU_SDK_ROOT:-}
+    export WITH_LITE=${WITH_LITE:-OFF}
+    export WITH_XPU_BKCL=${WITH_XPU_BKCL:-OFF}
+    export WITH_ARM=${WITH_ARM:-OFF}
+    export WITH_ASCEND=${WITH_ASCEND:-OFF}
+    export WITH_ASCEND_CL=${WITH_ASCEND_CL:-OFF}
+    export WITH_ASCEND_INT64=${WITH_ASCEND_INT64:-OFF}
+    export WITH_STRIP=${WITH_STRIP:-ON}
+    export ON_INFER=${ON_INFER:-OFF}
+    export WITH_HETERPS=${WITH_HETERPS:-OFF}
+    export WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF}
+    export CUDA_ARCH_BIN=${CUDA_ARCH_BIN}
+    export WITH_RECORD_BUILDTIME=${WITH_RECORD_BUILDTIME:-OFF}
+    export WITH_UNITY_BUILD=${WITH_UNITY_BUILD:-OFF}
+    export WITH_ONNXRUNTIME=${WITH_ONNXRUNTIME:-OFF}
+    export WITH_CUDNN_FRONTEND=${WITH_CUDNN_FRONTEND:-OFF}
+
+    # reset ccache zero stats for collect PR's actual hit rate
+    ccache -z
+
+    python setup.py install;build_error=$?
+
+    # ci will collect ccache hit rate
+    collect_ccache_hits
+
+    if [ "$build_error" != 0 ];then
+        exit 7;
+    fi
+    # exit status 7 above signals that "python setup.py install" failed
+}
 function main() {
     local CMD=$1
     local parallel_number=$2
@@ -3678,7 +3897,7 @@ function main() {
         parallel_test
         ;;
       build_gpubox)
-        cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number}
+        run_setup ${PYTHON_ABI:-""} 
         ;;
       check_xpu)
         cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number}
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 3a3c98a9e9956..3c6ac0229d58d 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -22,8 +22,13 @@ set(SETUP_LOG_FILE "setup.py.log")
 
 set(FLUID_CORE_NAME "libpaddle")
 
-configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
-               ${CMAKE_CURRENT_BINARY_DIR}/setup.py)
+if(WITH_SETUP_INSTALL)
+  configure_file(${CMAKE_CURRENT_SOURCE_DIR}/env_dict.py.in
+                 ${CMAKE_CURRENT_BINARY_DIR}/env_dict.py)
+else()
+  configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
+                 ${CMAKE_CURRENT_BINARY_DIR}/setup.py)
+endif()
 
 set(FLUID_DST_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/)
 
@@ -59,25 +64,48 @@ if(WITH_TESTING)
 endif()
 
 if(WIN32)
-  add_custom_command(
-    OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
-    COMMAND
-      ${CMAKE_COMMAND} -E copy_directory ${PADDLE_SOURCE_DIR}/python/paddle
-      ${PADDLE_BINARY_DIR}/python/paddle/
-    COMMAND ${CMAKE_COMMAND} -E env ${py_env} ${PYTHON_EXECUTABLE} setup.py
-            bdist_wheel
-    COMMENT "Packing whl packages------>>>"
-    DEPENDS copy_libpaddle ${FLUID_CORE} framework_py_proto profiler_py_proto
-            pass_desc_py_proto ${PY_FILES})
+  if(WITH_SETUP_INSTALL)
+    add_custom_command(
+      OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
+      COMMAND
+        ${CMAKE_COMMAND} -E copy_directory ${PADDLE_SOURCE_DIR}/python/paddle
+        ${PADDLE_BINARY_DIR}/python/paddle/
+      COMMENT "Packing whl packages------>>>"
+      DEPENDS copy_libpaddle ${FLUID_CORE} framework_py_proto profiler_py_proto
+              pass_desc_py_proto ${PY_FILES})
+  else()
+    add_custom_command(
+      OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
+      COMMAND
+        ${CMAKE_COMMAND} -E copy_directory ${PADDLE_SOURCE_DIR}/python/paddle
+        ${PADDLE_BINARY_DIR}/python/paddle/
+      COMMAND ${CMAKE_COMMAND} -E env ${py_env} ${PYTHON_EXECUTABLE} setup.py
+              bdist_wheel
+      COMMENT "Packing whl packages------>>>"
+      DEPENDS copy_libpaddle ${FLUID_CORE} framework_py_proto profiler_py_proto
+              pass_desc_py_proto ${PY_FILES})
+  endif()
 else()
-  add_custom_command(
-    OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
-    COMMAND touch stub.cc
-    COMMAND cp -r ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python
-    COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
-    COMMENT "Packing whl packages------>>>"
-    DEPENDS copy_libpaddle ${FLUID_CORE} framework_py_proto profiler_py_proto
-            pass_desc_py_proto ${PY_FILES})
+  if(WITH_SETUP_INSTALL)
+    add_custom_command(
+      OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
+      COMMAND touch stub.cc
+      COMMAND cp -r ${PADDLE_SOURCE_DIR}/python/paddle
+              ${PADDLE_BINARY_DIR}/python
+      COMMENT "Packing whl packages------>>>"
+      DEPENDS copy_libpaddle ${FLUID_CORE} framework_py_proto profiler_py_proto
+              pass_desc_py_proto ${PY_FILES})
+  else()
+    add_custom_command(
+      OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
+      COMMAND touch stub.cc
+      COMMAND cp -r ${PADDLE_SOURCE_DIR}/python/paddle
+              ${PADDLE_BINARY_DIR}/python
+      COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
+      COMMENT "Packing whl packages------>>>"
+      DEPENDS copy_libpaddle ${FLUID_CORE} framework_py_proto profiler_py_proto
+              pass_desc_py_proto ${PY_FILES})
+  endif()
 endif()
 
 add_custom_target(paddle_python ALL
@@ -93,8 +121,11 @@ if(WITH_TESTING)
   add_subdirectory(paddle/fluid/contrib/tests)
   add_subdirectory(paddle/fluid/contrib/slim/tests)
 endif()
-install(DIRECTORY ${PADDLE_PYTHON_PACKAGE_DIR}
-        DESTINATION opt/paddle/share/wheels)
+
+if(NOT WITH_SETUP_INSTALL)
+  install(DIRECTORY ${PADDLE_PYTHON_PACKAGE_DIR}
+          DESTINATION opt/paddle/share/wheels)
+endif()
 
 if(APPLE)
   find_program(INSTALL_NAME_TOOL_EXECUTABLE install_name_tool)
diff --git a/python/env_dict.py.in b/python/env_dict.py.in
new file mode 100644
index 0000000000000..5b479c7ae4c97
--- /dev/null
+++ b/python/env_dict.py.in
@@ -0,0 +1,69 @@
+env_dict={  # values filled in by CMake configure_file(); read by setup.py via env_dict.get()
+    'PADDLE_SOURCE_DIR':'@PADDLE_SOURCE_DIR@',
+    'PADDLE_VERSION':'@PADDLE_VERSION@',
+    'PADDLE_BINARY_DIR':'@PADDLE_BINARY_DIR@',
+    'TAG_VERSION_REGEX':'@TAG_VERSION_REGEX@',
+    'WITH_GPU':'@WITH_GPU@',
+    'CUDNN_MAJOR_VERSION':'@CUDNN_MAJOR_VERSION@',
+    'CUDNN_MINOR_VERSION':'@CUDNN_MINOR_VERSION@',
+    'CUDNN_PATCHLEVEL_VERSION':'@CUDNN_PATCHLEVEL_VERSION@',
+    'CUDA_VERSION':'@CUDA_VERSION@',
+    'WITH_PSLI':'@WITH_PSLI@',  # NOTE(review): looks like a truncated WITH_PSLIB (defined at the end of this dict); confirm nothing reads it, then remove
+    'FLUID_CORE_NAME':'@FLUID_CORE_NAME@',
+    'WARPCTC_LIBRARIES':'@WARPCTC_LIBRARIES@',
+    'LAPACK_LIB':'@LAPACK_LIB@',
+    'GFORTRAN_LIB':'@GFORTRAN_LIB@',
+    'GNU_RT_LIB_1':'@GNU_RT_LIB_1@',
+    'WITH_CUDNN_DSO':'@WITH_CUDNN_DSO@',
+    'CUDNN_LIBRARY':'@CUDNN_LIBRARY@',
+    'GNU_RT_LIB_2':'@GNU_RT_LIB_2@',
+    'WITH_MKL':'@WITH_MKL@',
+    'MKLML_SHARED_LIB':'@MKLML_SHARED_LIB@',
+    'MKLML_SHARED_IOMP_LIB':'@MKLML_SHARED_IOMP_LIB@',
+    'OPENBLAS_SHARED_LIB':'@OPENBLAS_SHARED_LIB@',
+    'OPENBLAS_LIB':'@OPENBLAS_LIB@',
+    'BLAS_LIB':'@BLAS_LIB@',
+    'WITH_LITE':'@WITH_LITE@',
+    'LITE_SHARED_LIB':'@LITE_SHARED_LIB@',
+    'LITE_WITH_NNADAPTER':'@LITE_WITH_NNADAPTER@',
+    'LITE_NNADAPTER_LIB':'@LITE_NNADAPTER_LIB@',
+    'NNADAPTER_WITH_HUAWEI_ASCEND_NPU':'@NNADAPTER_WITH_HUAWEI_ASCEND_NPU@',
+    'LITE_NNADAPTER_NPU_LIB':'@LITE_NNADAPTER_NPU_LIB@',
+    'WITH_CINN':'@WITH_CINN@',
+    'CINN_LIB_LOCATION':'@CINN_LIB_LOCATION@',
+    'CINN_LIB_NAME':'@CINN_LIB_NAME@',
+    'CINN_INCLUDE_DIR':'@CINN_INCLUDE_DIR@',
+    'CMAKE_BUILD_TYPE':'@CMAKE_BUILD_TYPE@',
+    'PSLIB_LIB':'@PSLIB_LIB@',
+    'PSLIB_VERSION_PY':'@PSLIB_VERSION_PY@',
+    'WITH_MKLDNN':'@WITH_MKLDNN@',
+    'MKLDNN_SHARED_LIB':'@MKLDNN_SHARED_LIB@',
+    'MKLDNN_SHARED_LIB_1':'@MKLDNN_SHARED_LIB_1@',
+    'MKLDNN_SHARED_LIB_2':'@MKLDNN_SHARED_LIB_2@',
+    'MKLDNN_INSTALL_DIR':'@MKLDNN_INSTALL_DIR@',
+    'WITH_ONNXRUNTIME':'@WITH_ONNXRUNTIME@',
+    'ONNXRUNTIME_SHARED_LIB':'@ONNXRUNTIME_SHARED_LIB@',
+    'PADDLE2ONNX_LIB':'@PADDLE2ONNX_LIB@',
+    'PADDLE2ONNX_LIB_NAME':'@PADDLE2ONNX_LIB_NAME@',
+    'ONNXRUNTIME_LIB_NAME':'@ONNXRUNTIME_LIB_NAME@',
+    'WITH_XPU':'@WITH_XPU@',
+    'XPU_API_LIB':'@XPU_API_LIB@',
+    'XPU_API_LIB_NAME':'@XPU_API_LIB_NAME@',
+    'XPU_RT_LIB':'@XPU_RT_LIB@',
+    'XPU_RT_LIB_NAME':'@XPU_RT_LIB_NAME@',
+    'WITH_XPU_BKCL':'@WITH_XPU_BKCL@',
+    'XPU_BKCL_LIB':'@XPU_BKCL_LIB@',
+    'XPU_BKCL_LIB_NAME':'@XPU_BKCL_LIB_NAME@',
+    'THIRD_PARTY_PATH':'@THIRD_PARTY_PATH@',
+    'SETUP_LOG_FILE':'@SETUP_LOG_FILE@',
+    'WITH_STRIP':'@WITH_STRIP@',
+    'PACKAGE_NAME':'@PACKAGE_NAME@',
+    'PADDLE_VERSION':'@PADDLE_VERSION@',  # NOTE(review): duplicate of the PADDLE_VERSION key near the top; same value, later entry wins
+    'APPLE':'@APPLE@',
+    'externalError_INCLUDE_DIR':'@externalError_INCLUDE_DIR@',
+    'WITH_ROCM':'@WITH_ROCM@',
+    'ORIGIN':'@ORIGIN@',
+    'WIN32':'@WIN32@',
+    'JIT_RELEASE_WHL':'@JIT_RELEASE_WHL@',
+    'WITH_PSLIB':'@WITH_PSLIB@'
+}
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000000000..58458c5470740
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,1380 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import errno
+import fnmatch
+import glob
+import multiprocessing
+import os
+import platform
+import re
+import shutil
+import subprocess
+import sys
+from contextlib import contextmanager
+from distutils.spawn import find_executable
+from subprocess import CalledProcessError
+
+from setuptools import Command, Distribution, Extension, setup
+from setuptools.command.egg_info import egg_info
+from setuptools.command.install import install as InstallCommandBase
+from setuptools.command.install_lib import install_lib
+from setuptools.dist import Distribution
+
+if sys.version_info < (3, 7):
+    raise RuntimeError(
+        "Paddle only supports Python version>=3.7 now, you are using Python %s"
+        % platform.python_version()
+    )
+else:
+    if os.getenv("PY_VERSION") is None:
+        print("export PY_VERSION = %s" % platform.python_version())
+        python_version = platform.python_version()
+        os.environ["PY_VERSION"] = python_version
+
+# check cmake
+CMAKE = find_executable('cmake3') or find_executable('cmake')
+assert (
+    CMAKE
+), 'The "cmake" executable is not found. Please check if Cmake is installed.'
+
+TOP_DIR = os.path.dirname(os.path.realpath(__file__))
+
+IS_WINDOWS = os.name == 'nt'
+
+
+def filter_setup_args(input_args):
+    """Split setup.py arguments into build-control flags and remaining args.
+
+    'rerun-cmake' (delete Cmakecache.txt and rerun cmake) and 'only-cmake'
+    (run cmake but not make) are consumed; 'clean', 'egg_info' and 'sdist'
+    are kept but turn cmake_and_build off.
+    """
+    control_words = ('rerun-cmake', 'only-cmake')
+    no_build_commands = ('clean', 'egg_info', 'sdist')
+    remaining = [arg for arg in input_args if arg not in control_words]
+    return (
+        not any(arg in no_build_commands for arg in remaining),
+        'only-cmake' in input_args,
+        'rerun-cmake' in input_args,
+        remaining,
+    )
+
+
+cmake_and_build, only_cmake, rerun_cmake, filter_args_list = filter_setup_args(
+    sys.argv
+)
+
+
+def parse_input_command(input_parameters):
+    """Validate the setup.py command line; exit with status 1 if invalid."""
+    dist = Distribution()
+    sys.argv = input_parameters  # note: replaces the process-wide argv
+    dist.script_name = os.path.basename(sys.argv[0])
+    # everything after the script name is a setup.py argument
+    dist.script_args = sys.argv[1:]
+    print(
+        "Start executing python {} {}".format(
+            dist.script_name, " ".join(dist.script_args)
+        )
+    )
+    try:
+        dist.parse_command_line()
+    except Exception:  # not bare: let SystemExit/KeyboardInterrupt propagate
+        print(
+            "An error occurred while parsing the parameters, '%s'"
+            % dist.script_args
+        )
+        sys.exit(1)
+
+
+class BinaryDistribution(Distribution):
+    def has_ext_modules(self):  # unconditionally reports extension modules
+        return True
+
+
+RC = 0
+ext_suffix = (
+    '.dll'
+    if os.name == 'nt'
+    else ('.dylib' if sys.platform == 'darwin' else '.so')
+)
+
+
+def get_header_install_dir(header):
+    """Return the path of `header` relative to the header install directory.
+
+    Directory prefixes are stripped as literal text, not as regex patterns.
+    """
+    if 'pb.h' in header:
+        install_dir = header.replace(
+            env_dict.get("PADDLE_BINARY_DIR") + '/', ''
+        )
+    elif 'third_party' not in header:
+        # paddle headers
+        install_dir = header.replace(
+            env_dict.get("PADDLE_SOURCE_DIR") + '/', ''
+        )
+        print('install_dir: ', install_dir)
+        if 'fluid/jit' in install_dir:
+            install_dir = install_dir.replace('fluid/jit', 'jit')
+            print('fluid/jit install_dir: ', install_dir)
+        if 'trace_event.h' in install_dir:
+            install_dir = install_dir.replace(
+                'fluid/platform/profiler', 'phi/backends/custom'
+            )
+            print('trace_event.h install_dir: ', install_dir)
+    else:
+        # third_party: re-add the '/' that is stripped along with the prefix
+        install_dir = header.replace(
+            env_dict.get("THIRD_PARTY_PATH") + '/', 'third_party/'
+        )
+        install_dir = install_dir.replace('install/mkldnn/include/', '')
+    return install_dir
+
+
+class InstallHeaders(Command):
+    """Custom command that copies C/C++ headers into the install tree."""
+
+    description = 'install C/C++ header files'
+
+    user_options = [
+        ('install-dir=', 'd', 'directory to install header files to'),
+        ('force', 'f', 'force installation (overwrite existing files)'),
+    ]
+
+    boolean_options = ['force']
+
+    def initialize_options(self):
+        self.outfiles = []
+        self.force = 0
+        self.install_dir = None
+
+    def finalize_options(self):
+        # Inherit any option left unset from the 'install' command.
+        inherited = (('install_headers', 'install_dir'), ('force', 'force'))
+        self.set_undefined_options('install', *inherited)
+
+    def run(self):
+        headers = self.distribution.headers
+        if headers:
+            self.mkpath(self.install_dir)
+        # An empty/None header list copies nothing.
+        for header in headers or ():
+            relative_dir = os.path.dirname(get_header_install_dir(header))
+            target_dir = os.path.join(self.install_dir, relative_dir)
+            if not os.path.exists(target_dir):
+                self.mkpath(target_dir)
+            copied, _ = self.copy_file(header, target_dir)
+            self.outfiles.append(copied)
+
+    def get_inputs(self):
+        if self.distribution.headers:
+            return self.distribution.headers
+        # Falsy header list (None or empty) is reported as an empty list.
+        return []
+
+    def get_outputs(self):
+        """Return the list of files copied by run()."""
+        return self.outfiles
+
+
+class InstallCommand(InstallCommandBase):
+    """Install into platlib and place headers under <platlib>/paddle/include."""
+
+    def finalize_options(self):
+        result = InstallCommandBase.finalize_options(self)
+        platlib = self.install_platlib
+        self.install_lib = platlib
+        print("install_lib:", platlib)
+        # Headers go inside the installed paddle package directory.
+        self.install_headers = os.path.join(platlib, 'paddle', 'include')
+        print("install_headers:", self.install_headers)
+        return result
+
+
+class EggInfo(egg_info):
+    """egg_info variant that may copy LICENSE into the egg-info directory."""
+
+    def run(self):
+        # Copy only when 'install' is recorded in have_run with a falsy
+        # value; a missing entry defaults to True and skips the copy.
+        install_ran = self.distribution.have_run.get('install', True)
+        if not install_ran:
+            self.mkpath(self.egg_info)
+            license_file = env_dict.get("PADDLE_SOURCE_DIR") + "/LICENSE"
+            self.copy_file(license_file, self.egg_info)
+        egg_info.run(self)
+
+
+# InstallLib extends install_lib to also copy headers to <lib>/paddle/include
+class InstallLib(install_lib):
+    def run(self):
+        """Build and install the library, copy headers, then byte-compile."""
+        self.build()
+        outfiles = self.install()
+        # No early return for an empty header list: the byte-compile step
+        # below must still run for the installed Python files.
+        for header in self.distribution.headers or []:
+            install_dir = get_header_install_dir(header)
+            install_dir = os.path.join(
+                self.install_dir, 'paddle/include', os.path.dirname(install_dir)
+            )
+            if not os.path.exists(install_dir):
+                self.mkpath(install_dir)
+            self.copy_file(header, install_dir)
+        if outfiles is not None:
+            # always compile, in case we have any extension stubs to deal with
+            self.byte_compile(outfiles)
+
+
+def git_commit():
+    """Return git's `rev-parse HEAD` output, or 'Unknown' if git cannot run."""
+    try:
+        cmd = ['git', 'rev-parse', 'HEAD']
+        git_commit = (
+            subprocess.Popen(
+                cmd,
+                stdout=subprocess.PIPE,
+                cwd=env_dict.get("PADDLE_SOURCE_DIR"),
+            )
+            .communicate()[0]
+            .strip()
+        ).decode('utf-8')  # decode here: the fallback below is already str
+    except Exception:
+        git_commit = 'Unknown'
+    return str(git_commit)
+
+
+def _get_version_detail(idx):
+    """Return component `idx` of PADDLE_VERSION (str), or int 0 if unavailable."""
+    assert (
+        idx < 3
+    ), "vesion info consists of %(major)d.%(minor)d.%(patch)d, \
+        so detail index must less than 3"
+    version = env_dict.get("PADDLE_VERSION")
+    if not re.match(env_dict.get("TAG_VERSION_REGEX"), version):
+        return 0  # version does not match the tag pattern
+    parts = version.split('.')
+    # fewer than three dot-separated parts also yields 0
+    return parts[idx] if len(parts) >= 3 else 0
+
+
+def _mkdir_p(dir_str):
+    try:
+        os.makedirs(dir_str)  # any OSError, incl. "already exists", is fatal
+    except OSError:
+        raise RuntimeError("Failed to create folder build/")
+
+
+def get_major():  # version component at index 0, converted to int
+    return int(_get_version_detail(0))
+
+
+def get_minor():  # version component at index 1, converted to int
+    return int(_get_version_detail(1))
+
+
+def get_patch():  # version component at index 2, kept as str (not int)
+    return str(_get_version_detail(2))
+
+
+def get_cuda_version():
+    """Return CUDA_VERSION from env_dict, or the string 'False' without GPU."""
+    if env_dict.get("WITH_GPU") != 'ON':
+        return 'False'
+    # WITH_GPU is 'ON': report the configured CUDA version as-is.
+    return env_dict.get("CUDA_VERSION")
+
+
+def get_cudnn_version():
+    """Return the cuDNN version string from env_dict, or 'False' without GPU.
+
+    Components are joined with '.' in major, minor, patch order, stopping at
+    the first one that is missing or empty; no major version gives ''.
+    """
+    if env_dict.get("WITH_GPU") != 'ON':
+        return 'False'
+    component_keys = (
+        "CUDNN_MAJOR_VERSION",
+        "CUDNN_MINOR_VERSION",
+        "CUDNN_PATCHLEVEL_VERSION",
+    )
+    components = []
+    for key in component_keys:
+        value = env_dict.get(key)
+        if not value:
+            # a gap ends the version: later components are ignored
+            break
+        components.append(value)
+    # equivalent to appending '.' + component one at a time
+    return '.'.join(components)
+
+
+def is_taged():
+    """Return True if HEAD is exactly at a tag equal to PADDLE_VERSION.
+
+    Every 'v' is removed from the tag first; failure to run git yields False.
+    """
+    try:
+        # stderr is discarded via DEVNULL; a shell-style '2>/dev/null' in an
+        # argv list would be handed to git as a literal argument.
+        cmd = ['git', 'describe', '--exact-match', '--tags', 'HEAD']
+        git_tag = (
+            subprocess.Popen(
+                cmd,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.DEVNULL,
+                cwd=env_dict.get("PADDLE_SOURCE_DIR"),
+            )
+            .communicate()[0]
+            .strip()
+        )
+        git_tag = git_tag.decode()
+    except Exception:
+        return False
+    if str(git_tag).replace('v', '') == env_dict.get("PADDLE_VERSION"):
+        return True
+    else:
+        return False
+
+
+def write_version_py(filename='paddle/version/__init__.py'):
+    """Generate the ``paddle.version`` module and write it to ``filename``.
+
+    The module text below is a ``%``-format template. Its placeholders are
+    filled from ``get_major()``, ``get_minor()``, ``get_patch()``, ``RC``,
+    ``get_cuda_version()``, ``get_cudnn_version()``, ``git_commit()``,
+    ``is_taged()`` and the ``WITH_MKL`` entry of ``env_dict``.
+
+    Args:
+        filename: path of the file to create; its parent directory is created
+            when missing.
+
+    Raises:
+        OSError: when the directory cannot be created for a reason other than
+            already existing, or when the file cannot be written.
+    """
+    cnt = '''# THIS FILE IS GENERATED FROM PADDLEPADDLE SETUP.PY
+#
+full_version    = '%(major)d.%(minor)d.%(patch)s'
+major           = '%(major)d'
+minor           = '%(minor)d'
+patch           = '%(patch)s'
+rc              = '%(rc)d'
+cuda_version    = '%(cuda)s'
+cudnn_version   = '%(cudnn)s'
+istaged         = %(istaged)s
+commit          = '%(commit)s'
+with_mkl        = '%(with_mkl)s'
+
+__all__ = ['cuda', 'cudnn', 'show']
+
+def show():
+    """Get the version of paddle if `paddle` package if tagged. Otherwise, output the corresponding commit id.
+
+    Returns:
+        If paddle package is not tagged, the commit-id of paddle will be output.
+        Otherwise, the following information will be output.
+
+        full_version: version of paddle
+
+        major: the major version of paddle
+
+        minor: the minor version of paddle
+
+        patch: the patch level version of paddle
+
+        rc: whether it's rc version
+
+        cuda: the cuda version of package. It will return `False` if CPU version paddle package is installed
+
+        cudnn: the cudnn version of package. It will return `False` if CPU version paddle package is installed
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+
+            # Case 1: paddle is tagged with 2.2.0
+            paddle.version.show()
+            # full_version: 2.2.0
+            # major: 2
+            # minor: 2
+            # patch: 0
+            # rc: 0
+            # cuda: '10.2'
+            # cudnn: '7.6.5'
+
+            # Case 2: paddle is not tagged
+            paddle.version.show()
+            # commit: cfa357e984bfd2ffa16820e354020529df434f7d
+            # cuda: '10.2'
+            # cudnn: '7.6.5'
+    """
+    if istaged:
+        print('full_version:', full_version)
+        print('major:', major)
+        print('minor:', minor)
+        print('patch:', patch)
+        print('rc:', rc)
+    else:
+        print('commit:', commit)
+    print('cuda:', cuda_version)
+    print('cudnn:', cudnn_version)
+
+def mkl():
+    return with_mkl
+
+def cuda():
+    """Get cuda version of paddle package.
+
+    Returns:
+        string: Return the version information of cuda. If paddle package is CPU version, it will return False.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+
+            paddle.version.cuda()
+            # '10.2'
+
+    """
+    return cuda_version
+
+def cudnn():
+    """Get cudnn version of paddle package.
+
+    Returns:
+        string: Return the version information of cudnn. If paddle package is CPU version, it will return False.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+
+            paddle.version.cudnn()
+            # '7.6.5'
+
+    """
+    return cudnn_version
+'''
+    commit = git_commit()
+
+    # Create the target directory; one that already exists is not an error.
+    try:
+        os.makedirs(os.path.dirname(filename))
+    except OSError as err:
+        if err.errno != errno.EEXIST:
+            raise
+
+    with open(filename, 'w') as version_file:
+        # The values are gathered only once the file is open, in this order.
+        substitutions = {
+            'major': get_major(),
+            'minor': get_minor(),
+            'patch': get_patch(),
+            'rc': RC,
+            'version': env_dict.get("PADDLE_VERSION"),
+            'cuda': get_cuda_version(),
+            'cudnn': get_cudnn_version(),
+            'commit': commit,
+            'istaged': is_taged(),
+            'with_mkl': env_dict.get("WITH_MKL"),
+        }
+        version_file.write(cnt % substitutions)
+
+
+def write_cuda_env_config_py(filename='paddle/cuda_env.py'):
+    """Write the generated ``cuda_env`` module to ``filename``.
+
+    When ``env_dict["JIT_RELEASE_WHL"]`` is ``'ON'`` the module sets the
+    ``CUDA_CACHE_MAXSIZE`` environment variable on import; otherwise an empty
+    file is written.
+
+    Args:
+        filename: path of the file to (over)write.
+    """
+    if env_dict.get("JIT_RELEASE_WHL") == 'ON':
+        content = '''# THIS FILE IS GENERATED FROM PADDLEPADDLE SETUP.PY
+#
+import os
+os.environ['CUDA_CACHE_MAXSIZE'] = '805306368'
+'''
+    else:
+        content = ""
+
+    with open(filename, 'w') as out:
+        out.write(content)
+
+
+def write_parameter_server_version_py(
+    filename='paddle/fluid/incubate/fleet/parameter_server/version.py',
+):
+    """Write the generated parameter-server ``version`` module to ``filename``.
+
+    The generated module defines ``BUILD_MODE`` as ``Mode.PSLIB`` when
+    ``env_dict["WITH_PSLIB"]`` is ``'ON'`` and as ``Mode.TRANSPILER``
+    otherwise, plus an ``is_transpiler()`` helper.
+
+    Args:
+        filename: path of the file to create; its parent directory is created
+            when missing.
+
+    Raises:
+        OSError: when the directory cannot be created for a reason other than
+            already existing, or when the file cannot be written.
+    """
+    cnt = '''
+
+# THIS FILE IS GENERATED FROM PADDLEPADDLE SETUP.PY
+
+from paddle.fluid.incubate.fleet.base.mode import Mode
+
+BUILD_MODE=Mode.%(mode)s
+
+def is_transpiler():
+    return Mode.TRANSPILER == BUILD_MODE
+
+'''
+
+    # Create the target directory; one that already exists is not an error.
+    try:
+        os.makedirs(os.path.dirname(filename))
+    except OSError as err:
+        if err.errno != errno.EEXIST:
+            raise
+
+    with open(filename, 'w') as out:
+        if env_dict.get("WITH_PSLIB") == 'ON':
+            build_mode = 'PSLIB'
+        else:
+            build_mode = 'TRANSPILER'
+        out.write(cnt % {'mode': build_mode})
+
+
+def find_files(pattern, root, recursive=False):
+    """Yield paths of files below ``root`` whose names match ``pattern``.
+
+    Args:
+        pattern: ``fnmatch``-style pattern applied to bare file names.
+        root: directory where the search starts.
+        recursive: when False only ``root`` itself is inspected; when True
+            every directory visited by ``os.walk(root)`` is searched.
+
+    Yields:
+        str: the directory path joined with each matching file name.
+    """
+    for folder, _subdirs, names in os.walk(root):
+        matches = fnmatch.filter(names, pattern)
+        yield from (os.path.join(folder, name) for name in matches)
+        if not recursive:
+            # os.walk reports ``root`` first, so stop after that directory.
+            return
+
+
+@contextmanager
+def cd(path):
+    """Context manager that runs its body with ``path`` as working directory.
+
+    Args:
+        path: absolute directory to change into.
+
+    Raises:
+        RuntimeError: when ``path`` is not absolute.
+
+    The previous working directory is restored on exit, including when the
+    body raises.
+    """
+    if os.path.isabs(path):
+        previous_dir = os.getcwd()
+        os.chdir(path)
+        try:
+            yield
+        finally:
+            os.chdir(previous_dir)
+    else:
+        raise RuntimeError('Can only cd to absolute path, got: {}'.format(path))
+
+
+def options_process(args, build_options):
+    """Append one ``-D<key>=<value>`` argument per build option to ``args``.
+
+    Args:
+        args: list that is extended in place.
+        build_options: mapping of option name to value; entries whose value
+            is None are skipped. Options are emitted in sorted key order.
+    """
+    for name in sorted(build_options):
+        setting = build_options[name]
+        if setting is None:
+            continue
+        args.append("-D{}={}".format(name, setting))
+
+
+def cmake_run(args, build_path):
+    """Run the CMake configure step from inside ``build_path``.
+
+    The command is ``CMAKE -DWITH_SETUP_INSTALL=ON <args...> TOP_DIR``.
+
+    Args:
+        args: extra command-line arguments (typically ``-D`` definitions).
+        build_path: absolute directory used as working directory.
+
+    Raises:
+        subprocess.CalledProcessError: when the command exits non-zero.
+    """
+    with cd(build_path):
+        cmake_args = [CMAKE, '-DWITH_SETUP_INSTALL=ON', *args, TOP_DIR]
+        print("cmake_args:", cmake_args)
+        subprocess.check_call(cmake_args)
+
+
+def build_run(args, build_path, envrion_var):
+    """Invoke ``CMAKE`` with ``args`` inside ``build_path``.
+
+    Args:
+        args: command-line arguments placed after the CMake executable.
+        build_path: absolute directory used as working directory.
+        envrion_var: environment mapping handed to the child process.
+
+    The interpreter exits with status 1 when the command fails or the run is
+    interrupted from the keyboard.
+    """
+    with cd(build_path):
+        command = [CMAKE, *args]
+        print(" ".join(command))
+        try:
+            subprocess.check_call(command, cwd=build_path, env=envrion_var)
+        except (CalledProcessError, KeyboardInterrupt):
+            sys.exit(1)
+
+
+def build_steps():
+    """Configure (when needed) and build Paddle through CMake.
+
+    The build tree is ``TOP_DIR/build``. The configure step runs when that
+    directory holds no ``CMakeCache.txt``; an existing cache is deleted first
+    when ``rerun_cmake`` is True. ``-D`` definitions are collected from the
+    environment: every variable whose name starts with ``CMAKE_`` or
+    ``WITH_``, plus the names in ``passthrough_options``.
+
+    When ``only_cmake`` is set the process exits right after configuration.
+    Otherwise the ``install`` target is built in ``Release`` configuration,
+    using ``MAX_JOBS`` parallel jobs when that variable is set and the CPU
+    count otherwise.
+    """
+    print('------- Building start ------')
+    build_path = TOP_DIR + '/build'
+    if not os.path.exists(build_path):
+        _mkdir_p(build_path)
+    # run cmake to generate native build files
+    cmake_cache_file_path = os.path.join(build_path, "CMakeCache.txt")
+    # if rerun_cmake is True, remove CMakeCache.txt so that cmake runs again
+    if os.path.isfile(cmake_cache_file_path) and rerun_cmake is True:
+        os.remove(cmake_cache_file_path)
+    if not os.path.exists(cmake_cache_file_path):
+        env_var = os.environ.copy()  # get env variables
+        paddle_build_options = {}
+        # Environment variables forwarded to cmake although they carry no
+        # "WITH_" / "CMAKE_" prefix.
+        passthrough_options = {
+            "PYTHON_LIBRARY",
+            "INFERENCE_DEMO_INSTALL_DIR",
+            "ON_INFER",
+            "PYTHON_EXECUTABLE",
+            "TENSORRT_ROOT",
+            "CUDA_ARCH_NAME",
+            "CUDA_ARCH_BIN",
+            "PYTHON_INCLUDE_DIR",
+            "PYTHON_LIBRARIES",
+            "PY_VERSION",
+            "CUB_PATH",
+            "NEW_RELEASE_PYPI",
+            "CUDNN_ROOT",
+            "THIRD_PARTY_PATH",
+            "NOAVX_CORE_FILE",
+            "LITE_GIT_TAG",
+            "CUDA_TOOLKIT_ROOT_DIR",
+            "NEW_RELEASE_JIT",
+            "XPU_SDK_ROOT",
+            "MSVC_STATIC_CRT",
+            "Ninja",
+            "NEW_RELEASE_ALL",
+        }
+        # Options that are passed with an explicit ":FILEPATH" type suffix.
+        filepath_options = (
+            'PYTHON_EXECUTABLE',
+            'PYTHON_LIBRARY',
+            'PYTHON_LIBRARIES',
+        )
+        for option_key, option_value in env_var.items():
+            if option_key.startswith(("CMAKE_", "WITH_")):
+                paddle_build_options[option_key] = option_value
+            if option_key in passthrough_options:
+                if option_key in filepath_options:
+                    key = option_key + ":FILEPATH"
+                elif option_key == 'PYTHON_INCLUDE_DIR':
+                    key = option_key + ':PATH'
+                else:
+                    key = option_key
+                if key not in paddle_build_options:
+                    paddle_build_options[key] = option_value
+        args = []
+        options_process(args, paddle_build_options)
+        print("args:", args)
+        cmake_run(args, build_path)
+    # make
+    if only_cmake:
+        print(
+            "You have finished running cmake, the program exited,run 'ccmake build' to adjust build options and 'python setup.py install to build'"
+        )
+        sys.exit()
+    build_args = ["--build", ".", "--target", "install", "--config", 'Release']
+    max_jobs = os.getenv("MAX_JOBS")
+    if max_jobs is not None:
+        # An empty MAX_JOBS falls back to the CPU count.
+        max_jobs = max_jobs or str(multiprocessing.cpu_count())
+
+        # Everything after "--" is handed to the native build tool.
+        build_args += ["--"]
+        if IS_WINDOWS:
+            build_args += ["/p:CL_MPCount={}".format(max_jobs)]
+        else:
+            build_args += ["-j", max_jobs]
+    else:
+        # No "--" here: "-j" is given to "cmake --build" itself.
+        build_args += ["-j", str(multiprocessing.cpu_count())]
+    environ_var = os.environ.copy()
+    build_run(build_args, build_path, environ_var)
+
+
+def get_setup_requires():
+    """Read the requirement lines that apply to the running interpreter.
+
+    Loads ``<PADDLE_SOURCE_DIR>/python/requirements.txt`` and drops every line
+    that contains one of the markers for Python versions below 3.7.
+
+    Returns:
+        list: the remaining lines of the requirements file, in file order.
+
+    Raises:
+        RuntimeError: when the interpreter is older than Python 3.7.
+    """
+    requirements_path = (
+        env_dict.get("PADDLE_SOURCE_DIR") + '/python/requirements.txt'
+    )
+    with open(requirements_path) as f:
+        requirement_lines = f.read().splitlines()
+
+    if sys.version_info < (3, 7):
+        raise RuntimeError(
+            "please check your python version,Paddle only support Python version>=3.7 now"
+        )
+
+    # Markers of requirements that only apply to interpreters older than 3.7.
+    obsolete_markers = (
+        "<\"3.6\"",
+        "<=\"3.6\"",
+        "<\"3.5\"",
+        "<=\"3.5\"",
+        "<\"3.7\"",
+    )
+    return [
+        line
+        for line in requirement_lines
+        if not any(marker in line for marker in obsolete_markers)
+    ]
+
+
+def get_package_data_and_package_dir():
+    """Build the package_data / package_dir mappings and the ext_modules list.
+
+    Besides assembling the return values, this function has side effects:
+    it copies the third-party libraries selected through ``env_dict`` into
+    ``<paddle_binary_dir>/python/paddle/libs``, rewrites the rpath / install
+    name of several libraries by running ``patchelf`` or ``install_name_tool``
+    through ``os.system``, and removes ``libs/__init__.py`` when present.
+
+    Returns:
+        tuple: ``(package_data, package_dir, ext_modules)``.
+
+    Raises:
+        Exception: when one of the patch commands exits with a non-zero
+            status.
+    """
+    # The core extension is a .so everywhere except Windows (.pyd + .lib).
+    if os.name != 'nt':
+        package_data = {
+            'paddle.fluid': [env_dict.get("FLUID_CORE_NAME") + '.so']
+        }
+    else:
+        package_data = {
+            'paddle.fluid': [
+                env_dict.get("FLUID_CORE_NAME") + '.pyd',
+                env_dict.get("FLUID_CORE_NAME") + '.lib',
+            ]
+        }
+    package_data['paddle.fluid'] += [
+        paddle_binary_dir + '/python/paddle/cost_model/static_op_benchmark.json'
+    ]
+    # "develop" installs use paths relative to the last component of
+    # paddle_binary_dir; other commands use the full PADDLE_BINARY_DIR.
+    if 'develop' in sys.argv:
+        package_dir = {
+            '': paddle_binary_dir.split('/')[-1] + '/python',
+            # '':'build/python',
+            # The paddle.fluid.proto will be generated while compiling.
+            # So that package points to other directory.
+            'paddle.fluid.proto.profiler': paddle_binary_dir.split('/')[-1]
+            + '/paddle/fluid/platform',
+            'paddle.fluid.proto': paddle_binary_dir.split('/')[-1]
+            + '/paddle/fluid/framework',
+            'paddle.fluid': paddle_binary_dir.split('/')[-1]
+            + '/python/paddle/fluid',
+        }
+    else:
+        package_dir = {
+            '': env_dict.get("PADDLE_BINARY_DIR") + '/python',
+            'paddle.fluid.proto.profiler': env_dict.get("PADDLE_BINARY_DIR")
+            + '/paddle/fluid/platform',
+            'paddle.fluid.proto': env_dict.get("PADDLE_BINARY_DIR")
+            + '/paddle/fluid/framework',
+            'paddle.fluid': env_dict.get("PADDLE_BINARY_DIR")
+            + '/python/paddle/fluid',
+        }
+    # put all thirdparty libraries in paddle.libs
+    libs_path = paddle_binary_dir + '/python/paddle/libs'
+    # NOTE(review): this empty assignment is overwritten by the next statement.
+    package_data['paddle.libs'] = []
+    package_data['paddle.libs'] = [
+        ('libwarpctc' if os.name != 'nt' else 'warpctc') + ext_suffix
+    ]
+    shutil.copy(env_dict.get("WARPCTC_LIBRARIES"), libs_path)
+    package_data['paddle.libs'] += [
+        os.path.basename(env_dict.get("LAPACK_LIB")),
+        os.path.basename(env_dict.get("BLAS_LIB")),
+        os.path.basename(env_dict.get("GFORTRAN_LIB")),
+        os.path.basename(env_dict.get("GNU_RT_LIB_1")),
+    ]
+    shutil.copy(env_dict.get("BLAS_LIB"), libs_path)
+    shutil.copy(env_dict.get("LAPACK_LIB"), libs_path)
+    shutil.copy(env_dict.get("GFORTRAN_LIB"), libs_path)
+    shutil.copy(env_dict.get("GNU_RT_LIB_1"), libs_path)
+    # cuDNN is bundled only when WITH_CUDNN_DSO is ON and the library exists.
+    if env_dict.get("WITH_CUDNN_DSO") == 'ON' and os.path.exists(
+        env_dict.get("CUDNN_LIBRARY")
+    ):
+        package_data['paddle.libs'] += [
+            os.path.basename(env_dict.get("CUDNN_LIBRARY"))
+        ]
+        shutil.copy(env_dict.get("CUDNN_LIBRARY"), libs_path)
+        if (
+            sys.platform.startswith("linux")
+            and env_dict.get("CUDNN_MAJOR_VERSION") == '8'
+        ):
+            # libcudnn.so includes libcudnn_ops_infer.so, libcudnn_ops_train.so,
+            # libcudnn_cnn_infer.so, libcudnn_cnn_train.so, libcudnn_adv_infer.so,
+            # libcudnn_adv_train.so
+            cudnn_lib_files = glob.glob(
+                os.path.dirname(env_dict.get("CUDNN_LIBRARY"))
+                + '/libcudnn_*so.8'
+            )
+            for cudnn_lib in cudnn_lib_files:
+                if os.path.exists(cudnn_lib):
+                    package_data['paddle.libs'] += [os.path.basename(cudnn_lib)]
+                    shutil.copy(cudnn_lib, libs_path)
+    # GNU_RT_LIB_2 is bundled on every platform except Linux.
+    if not sys.platform.startswith("linux"):
+        package_data['paddle.libs'] += [
+            os.path.basename(env_dict.get("GNU_RT_LIB_2"))
+        ]
+        shutil.copy(env_dict.get("GNU_RT_LIB_2"), libs_path)
+    # Math backend: MKL when enabled, otherwise OpenBLAS in the cases below.
+    if env_dict.get("WITH_MKL") == 'ON':
+        shutil.copy(env_dict.get("MKLML_SHARED_LIB"), libs_path)
+        shutil.copy(env_dict.get("MKLML_SHARED_IOMP_LIB"), libs_path)
+        package_data['paddle.libs'] += [
+            ('libmklml_intel' if os.name != 'nt' else 'mklml') + ext_suffix,
+            ('libiomp5' if os.name != 'nt' else 'libiomp5md') + ext_suffix,
+        ]
+    else:
+        if os.name == 'nt':
+            # copy the openblas.dll
+            shutil.copy(env_dict.get("OPENBLAS_SHARED_LIB"), libs_path)
+            package_data['paddle.libs'] += ['openblas' + ext_suffix]
+        elif (
+            os.name == 'posix'
+            and platform.machine() == 'aarch64'
+            and env_dict.get("OPENBLAS_LIB").endswith('so')
+        ):
+            # copy the libopenblas.so on linux+aarch64
+            # special: libpaddle.so without avx depends on 'libopenblas.so.0', not 'libopenblas.so'
+            if os.path.exists(env_dict.get("OPENBLAS_LIB") + '.0'):
+                shutil.copy(env_dict.get("OPENBLAS_LIB") + '.0', libs_path)
+                package_data['paddle.libs'] += ['libopenblas.so.0']
+
+    # Paddle-Lite and, nested below it, the optional NNAdapter libraries.
+    if env_dict.get("WITH_LITE") == 'ON':
+        shutil.copy(env_dict.get("LITE_SHARED_LIB"), libs_path)
+        package_data['paddle.libs'] += [
+            'libpaddle_full_api_shared' + ext_suffix
+        ]
+        if env_dict.get("LITE_WITH_NNADAPTER") == 'ON':
+            shutil.copy(env_dict.get("LITE_NNADAPTER_LIB"), libs_path)
+            package_data['paddle.libs'] += ['libnnadapter' + ext_suffix]
+            if env_dict.get("NNADAPTER_WITH_HUAWEI_ASCEND_NPU") == 'ON':
+                shutil.copy(env_dict.get("LITE_NNADAPTER_NPU_LIB"), libs_path)
+                package_data['paddle.libs'] += [
+                    'libnnadapter_driver_huawei_ascend_npu' + ext_suffix
+                ]
+    # CINN: copy the library and its CUDA runtime source, then patch the rpath
+    # of the copied library (Release builds on non-Windows only).
+    if env_dict.get("WITH_CINN") == 'ON':
+        shutil.copy(
+            env_dict.get("CINN_LIB_LOCATION")
+            + '/'
+            + env_dict.get("CINN_LIB_NAME"),
+            libs_path,
+        )
+        shutil.copy(
+            env_dict.get("CINN_INCLUDE_DIR")
+            + '/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh',
+            libs_path,
+        )
+        package_data['paddle.libs'] += ['libcinnapi.so']
+        package_data['paddle.libs'] += ['cinn_cuda_runtime_source.cuh']
+        if env_dict.get("CMAKE_BUILD_TYPE") == 'Release' and os.name != 'nt':
+            command = (
+                "patchelf --set-rpath '$ORIGIN/' %s/" % libs_path
+                + env_dict.get("CINN_LIB_NAME")
+            )
+            if os.system(command) != 0:
+                raise Exception(
+                    'patch '
+                    + libs_path
+                    + '/'
+                    + env_dict.get("CINN_LIB_NAME")
+                    + ' failed',
+                    'command: %s' % command,
+                )
+    if env_dict.get("WITH_PSLIB") == 'ON':
+        shutil.copy(env_dict.get("PSLIB_LIB"), libs_path)
+        if os.path.exists(env_dict.get("PSLIB_VERSION_PY")):
+            shutil.copy(
+                env_dict.get("PSLIB_VERSION_PY"),
+                paddle_binary_dir
+                + '/python/paddle/fluid/incubate/fleet/parameter_server/pslib/',
+            )
+        package_data['paddle.libs'] += ['libps' + ext_suffix]
+    if env_dict.get("WITH_MKLDNN") == 'ON':
+        if env_dict.get("CMAKE_BUILD_TYPE") == 'Release' and os.name != 'nt':
+            # only change rpath in Release mode.
+            # TODO(typhoonzero): use install_name_tool to patch mkl libs once
+            # we can support mkl on mac.
+            #
+            # change rpath of libdnnl.so.1, add $ORIGIN/ to it.
+            # The reason is that all thirdparty libraries in the same directory,
+            # thus, libdnnl.so.1 will find libmklml_intel.so and libiomp5.so.
+            command = "patchelf --set-rpath '$ORIGIN/' " + env_dict.get(
+                "MKLDNN_SHARED_LIB"
+            )
+            if os.system(command) != 0:
+                raise Exception(
+                    "patch libdnnl.so failed, command: %s" % command
+                )
+        # The library is patched in place above, before it is copied.
+        shutil.copy(env_dict.get("MKLDNN_SHARED_LIB"), libs_path)
+        if os.name != 'nt':
+            shutil.copy(env_dict.get("MKLDNN_SHARED_LIB_1"), libs_path)
+            shutil.copy(env_dict.get("MKLDNN_SHARED_LIB_2"), libs_path)
+            package_data['paddle.libs'] += [
+                'libmkldnn.so.0',
+                'libdnnl.so.1',
+                'libdnnl.so.2',
+            ]
+        else:
+            package_data['paddle.libs'] += ['mkldnn.dll']
+
+    if env_dict.get("WITH_ONNXRUNTIME") == 'ON':
+        shutil.copy(env_dict.get("ONNXRUNTIME_SHARED_LIB"), libs_path)
+        shutil.copy(env_dict.get("PADDLE2ONNX_LIB"), libs_path)
+        if os.name == 'nt':
+            package_data['paddle.libs'] += [
+                'paddle2onnx.dll',
+                'onnxruntime.dll',
+            ]
+        else:
+            package_data['paddle.libs'] += [
+                env_dict.get("PADDLE2ONNX_LIB_NAME"),
+                env_dict.get("ONNXRUNTIME_LIB_NAME"),
+            ]
+
+    if env_dict.get("WITH_XPU") == 'ON':
+        # only change rpath in Release mode,
+        if env_dict.get("CMAKE_BUILD_TYPE") == 'Release':
+            if os.name != 'nt':
+                if env_dict.get("APPLE") == "1":
+                    command = (
+                        "install_name_tool -id \"@loader_path/\" "
+                        + env_dict.get("XPU_API_LIB")
+                    )
+                else:
+                    command = "patchelf --set-rpath '$ORIGIN/' " + env_dict.get(
+                        "XPU_API_LIB"
+                    )
+                if os.system(command) != 0:
+                    # NOTE(review): the message has no space between the
+                    # library path and 'failed'.
+                    raise Exception(
+                        'patch ' + env_dict.get("XPU_API_LIB") + 'failed ,',
+                        "command: %s" % command,
+                    )
+        shutil.copy(env_dict.get("XPU_API_LIB"), libs_path)
+        shutil.copy(env_dict.get("XPU_RT_LIB"), libs_path)
+        package_data['paddle.libs'] += [
+            env_dict.get("XPU_API_LIB_NAME"),
+            env_dict.get("XPU_RT_LIB_NAME"),
+        ]
+
+    if env_dict.get("WITH_XPU_BKCL") == 'ON':
+        shutil.copy(env_dict.get("XPU_BKCL_LIB"), libs_path)
+        package_data['paddle.libs'] += [env_dict.get("XPU_BKCL_LIB_NAME")]
+
+    # remove unused paddle/libs/__init__.py
+    if os.path.isfile(libs_path + '/__init__.py'):
+        os.remove(libs_path + '/__init__.py')
+    package_dir['paddle.libs'] = libs_path
+
+    # change rpath of ${FLUID_CORE_NAME}.ext, add $ORIGIN/../libs/ to it.
+    # The reason is that libwarpctc.ext, libiomp5.ext etc are in paddle.libs, and
+    # ${FLUID_CORE_NAME}.ext is in paddle.fluid, thus paddle/fluid/../libs will point to the above libraries.
+    # This operation will fix https://github.com/PaddlePaddle/Paddle/issues/3213
+    if env_dict.get("CMAKE_BUILD_TYPE") == 'Release':
+        if os.name != 'nt':
+            # only change rpath in Release mode, since in Debug mode, ${FLUID_CORE_NAME}.xx is too large to be changed.
+            if env_dict.get("APPLE") == "1":
+                commands = [
+                    "install_name_tool -id '@loader_path/../libs/' "
+                    + env_dict.get("PADDLE_BINARY_DIR")
+                    + '/python/paddle/fluid/'
+                    + env_dict.get("FLUID_CORE_NAME")
+                    + '.so'
+                ]
+                commands.append(
+                    "install_name_tool -add_rpath '@loader_path/../libs/' "
+                    + env_dict.get("PADDLE_BINARY_DIR")
+                    + '/python/paddle/fluid/'
+                    + env_dict.get("FLUID_CORE_NAME")
+                    + '.so'
+                )
+            else:
+                commands = [
+                    "patchelf --set-rpath '$ORIGIN/../libs/' "
+                    + env_dict.get("PADDLE_BINARY_DIR")
+                    + '/python/paddle/fluid/'
+                    + env_dict.get("FLUID_CORE_NAME")
+                    + '.so'
+                ]
+            # patchelf is not supported on sw_64, so patching is skipped there
+            # (mips64 is skipped as well).
+            if platform.machine() != 'sw_64' and platform.machine() != 'mips64':
+                for command in commands:
+                    if os.system(command) != 0:
+                        raise Exception(
+                            'patch '
+                            + env_dict.get("FLUID_CORE_NAME")
+                            + '.%s failed' % ext_suffix,
+                            'command: %s' % command,
+                        )
+    # A list of extensions that specify c++ -written modules that compile source code into dynamically linked libraries
+    ext_modules = [Extension('_foo', [paddle_binary_dir + '/python/stub.cc'])]
+    if os.name == 'nt':
+        # fix the path separator under windows
+        fix_package_dir = {}
+        for k, v in package_dir.items():
+            fix_package_dir[k] = v.replace('/', '\\')
+        package_dir = fix_package_dir
+        ext_modules = []
+    elif sys.platform == 'darwin':
+        # No stub extension on macOS either.
+        ext_modules = []
+    return package_data, package_dir, ext_modules
+
+
+def get_headers():
+    """Collect the header files that are shipped with the package.
+
+    Returns:
+        list: paths produced by ``find_files`` for the directories listed in
+        ``header_sources`` and for selected jit headers, all located under
+        ``paddle_source_dir``. MKLDNN include files are added when
+        ``WITH_MKLDNN`` is ``'ON'``; ``*.pb`` files from
+        ``externalError_INCLUDE_DIR`` are added when ``WITH_GPU`` or
+        ``WITH_ROCM`` is ``'ON'``.
+    """
+    # (file pattern, directory below paddle_source_dir, search recursively?)
+    # The result keeps the order of this table.
+    header_sources = [
+        # paddle level api headers
+        ('*.h', '/paddle', False),
+        # phi api headers
+        ('*.h', '/paddle/phi/api', False),
+        ('*.h', '/paddle/phi/api/ext', False),
+        ('*.h', '/paddle/phi/api/include', False),
+        ('*.h', '/paddle/phi/common', False),
+        # phi level api headers (low level api)
+        ('*.h', '/paddle/phi', False),
+        ('*.h', '/paddle/phi/include', True),
+        ('*.h', '/paddle/phi/backends', True),
+        ('*.h', '/paddle/phi/core', True),
+        ('*.h', '/paddle/phi/infermeta', True),
+        # phi kernels headers
+        ('*.h', '/paddle/phi/kernels', False),
+        ('*.h', '/paddle/phi/kernels/sparse', False),
+        ('*.h', '/paddle/phi/kernels/selected_rows', False),
+        ('*.h', '/paddle/phi/kernels/strings', False),
+        ('*.h', '/paddle/phi/kernels/primitive', False),
+        # capi headers
+        ('*.h', '/paddle/phi/capi', True),
+        # profiler headers
+        ('trace_event.h', '/paddle/fluid/platform/profiler', False),
+        # utils api headers
+        ('*.h', '/paddle/utils', True),
+    ]
+    headers = []
+    for pattern, subdir, recursive in header_sources:
+        headers += list(
+            find_files(pattern, paddle_source_dir + subdir, recursive=recursive)
+        )
+
+    # Only these jit headers are collected, wherever they sit below fluid/jit.
+    jit_layer_headers = (
+        'layer.h',
+        'serializer.h',
+        'serializer_utils.h',
+        'all.h',
+        'function.h',
+    )
+    for header_name in jit_layer_headers:
+        headers += list(
+            find_files(
+                header_name,
+                paddle_source_dir + '/paddle/fluid/jit',
+                recursive=True,
+            )
+        )
+
+    if env_dict.get("WITH_MKLDNN") == 'ON':
+        # mkldnn
+        mkldnn_include_dir = env_dict.get("MKLDNN_INSTALL_DIR") + '/include'
+        headers += list(find_files('*', mkldnn_include_dir))
+
+    if env_dict.get("WITH_GPU") == 'ON' or env_dict.get("WITH_ROCM") == 'ON':
+        # externalErrorMsg.pb for External Error message
+        external_error_dir = env_dict.get("externalError_INCLUDE_DIR")
+        headers += list(find_files('*.pb', external_error_dir))
+    return headers
+
+
+def get_setup_parameters():
+    """Gather the values that main() needs for the setup() call.
+
+    Returns:
+        tuple: ``(setup_requires, packages, paddle_bins, package_data,
+        package_dir, ext_modules, headers)``. ``paddle_bins`` is a list that
+        holds the ``paddle`` script path, or is empty when
+        ``env_dict["WIN32"]`` is set.
+    """
+    # get setup_requires
+    setup_requires = get_setup_requires()
+    # Every package name appears exactly once.
+    packages = [
+        'paddle',
+        'paddle.libs',
+        'paddle.utils',
+        'paddle.utils.gast',
+        'paddle.utils.cpp_extension',
+        'paddle.dataset',
+        'paddle.reader',
+        'paddle.distributed',
+        'paddle.distributed.communication',
+        'paddle.distributed.communication.stream',
+        'paddle.distributed.metric',
+        'paddle.distributed.ps',
+        'paddle.distributed.ps.utils',
+        'paddle.incubate',
+        'paddle.incubate.autograd',
+        'paddle.incubate.optimizer',
+        'paddle.incubate.checkpoint',
+        'paddle.incubate.operators',
+        'paddle.incubate.tensor',
+        'paddle.incubate.multiprocessing',
+        'paddle.incubate.nn',
+        'paddle.incubate.asp',
+        'paddle.incubate.passes',
+        'paddle.distribution',
+        'paddle.distributed.utils',
+        'paddle.distributed.sharding',
+        'paddle.distributed.fleet',
+        'paddle.distributed.launch',
+        'paddle.distributed.launch.context',
+        'paddle.distributed.launch.controllers',
+        'paddle.distributed.launch.job',
+        'paddle.distributed.launch.plugins',
+        'paddle.distributed.launch.utils',
+        'paddle.distributed.fleet.base',
+        'paddle.distributed.fleet.recompute',
+        'paddle.distributed.fleet.elastic',
+        'paddle.distributed.fleet.meta_optimizers',
+        'paddle.distributed.fleet.meta_optimizers.sharding',
+        'paddle.distributed.fleet.meta_optimizers.ascend',
+        'paddle.distributed.fleet.meta_optimizers.dygraph_optimizer',
+        'paddle.distributed.fleet.runtime',
+        'paddle.distributed.rpc',
+        'paddle.distributed.fleet.dataset',
+        'paddle.distributed.fleet.data_generator',
+        'paddle.distributed.fleet.metrics',
+        'paddle.distributed.fleet.proto',
+        'paddle.distributed.fleet.utils',
+        'paddle.distributed.fleet.layers',
+        'paddle.distributed.fleet.layers.mpu',
+        'paddle.distributed.fleet.meta_parallel',
+        'paddle.distributed.fleet.meta_parallel.pp_utils',
+        'paddle.distributed.fleet.meta_parallel.sharding',
+        'paddle.distributed.fleet.meta_parallel.parallel_layers',
+        'paddle.distributed.auto_parallel',
+        'paddle.distributed.auto_parallel.operators',
+        'paddle.distributed.auto_parallel.tuner',
+        'paddle.distributed.auto_parallel.cost',
+        'paddle.distributed.passes',
+        'paddle.distributed.models',
+        'paddle.distributed.models.moe',
+        'paddle.framework',
+        'paddle.jit',
+        'paddle.jit.dy2static',
+        'paddle.inference',
+        'paddle.inference.contrib',
+        'paddle.inference.contrib.utils',
+        'paddle.fluid',
+        'paddle.fluid.dygraph',
+        'paddle.fluid.dygraph.amp',
+        'paddle.fluid.proto',
+        'paddle.fluid.proto.profiler',
+        'paddle.fluid.distributed',
+        'paddle.fluid.layers',
+        'paddle.fluid.dataloader',
+        'paddle.fluid.contrib',
+        'paddle.fluid.contrib.quantize',
+        'paddle.fluid.contrib.slim',
+        'paddle.fluid.contrib.slim.quantization',
+        'paddle.fluid.contrib.slim.quantization.imperative',
+        'paddle.fluid.contrib.extend_optimizer',
+        'paddle.fluid.contrib.mixed_precision',
+        'paddle.fluid.contrib.mixed_precision.bf16',
+        'paddle.fluid.contrib.layers',
+        'paddle.fluid.contrib.sparsity',
+        'paddle.fluid.transpiler',
+        'paddle.fluid.transpiler.details',
+        'paddle.fluid.incubate',
+        'paddle.fluid.incubate.data_generator',
+        'paddle.fluid.incubate.fleet',
+        'paddle.fluid.incubate.checkpoint',
+        'paddle.fluid.incubate.fleet.base',
+        'paddle.fluid.incubate.fleet.parameter_server',
+        'paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler',
+        'paddle.fluid.incubate.fleet.parameter_server.pslib',
+        'paddle.fluid.incubate.fleet.parameter_server.ir',
+        'paddle.fluid.incubate.fleet.collective',
+        'paddle.fluid.incubate.fleet.utils',
+        'paddle.amp',
+        'paddle.cost_model',
+        'paddle.hapi',
+        'paddle.vision',
+        'paddle.vision.models',
+        'paddle.vision.transforms',
+        'paddle.vision.datasets',
+        'paddle.audio',
+        'paddle.audio.functional',
+        'paddle.audio.features',
+        'paddle.audio.datasets',
+        'paddle.audio.backends',
+        'paddle.text',
+        'paddle.text.datasets',
+        'paddle.incubate.nn.functional',
+        'paddle.incubate.nn.layer',
+        'paddle.incubate.optimizer.functional',
+        'paddle.incubate.distributed',
+        'paddle.incubate.distributed.utils',
+        'paddle.incubate.distributed.utils.io',
+        'paddle.incubate.distributed.fleet',
+        'paddle.incubate.distributed.models',
+        'paddle.incubate.distributed.models.moe',
+        'paddle.incubate.distributed.models.moe.gate',
+        'paddle.sparse',
+        'paddle.sparse.nn',
+        'paddle.sparse.nn.layer',
+        'paddle.sparse.nn.functional',
+        'paddle.incubate.xpu',
+        'paddle.io',
+        'paddle.optimizer',
+        'paddle.nn',
+        'paddle.nn.functional',
+        'paddle.nn.layer',
+        'paddle.nn.quant',
+        'paddle.nn.initializer',
+        'paddle.nn.utils',
+        'paddle.metric',
+        'paddle.static',
+        'paddle.static.nn',
+        'paddle.static.amp',
+        'paddle.static.sparsity',
+        'paddle.tensor',
+        'paddle.onnx',
+        'paddle.autograd',
+        'paddle.device',
+        'paddle.device.cuda',
+        'paddle.device.xpu',
+        'paddle.version',
+        'paddle.profiler',
+        'paddle.geometric',
+        'paddle.geometric.message_passing',
+        'paddle.geometric.sampling',
+    ]
+
+    # Keep the type consistent: a list in both cases, empty when WIN32 is set.
+    paddle_bins = []
+    if not env_dict.get("WIN32"):
+        paddle_bins = [
+            env_dict.get("PADDLE_BINARY_DIR") + '/paddle/scripts/paddle'
+        ]
+    package_data, package_dir, ext_modules = get_package_data_and_package_dir()
+    headers = get_headers()
+    return (
+        setup_requires,
+        packages,
+        paddle_bins,
+        package_data,
+        package_dir,
+        ext_modules,
+        headers,
+    )
+
+
+def main():
+    # Parse the command line and check arguments before we proceed with building steps and setup
+    parse_input_command(filter_args_list)
+
+    # Execute the build process,cmake and make
+    if cmake_and_build:
+        build_steps()
+
+    sys.path.append(TOP_DIR + "/build/python/")
+    from build.python.env_dict import env_dict as env_dict
+
+    global env_dict
+    global paddle_binary_dir, paddle_source_dir
+    paddle_binary_dir = env_dict.get("PADDLE_BINARY_DIR")
+    paddle_source_dir = env_dict.get("PADDLE_SOURCE_DIR")
+
+    # preparing parameters for setup()
+    paddle_version = env_dict.get("PADDLE_VERSION")
+    package_name = env_dict.get("PACKAGE_NAME")
+    write_version_py(
+        filename='{}/python/paddle/version/__init__.py'.format(
+            paddle_binary_dir
+        )
+    )
+    write_cuda_env_config_py(
+        filename='{}/python/paddle/cuda_env.py'.format(paddle_binary_dir)
+    )
+    write_parameter_server_version_py(
+        filename='{}/python/paddle/fluid/incubate/fleet/parameter_server/version.py'.format(
+            paddle_binary_dir
+        )
+    )
+
+    (
+        setup_requires,
+        packages,
+        scripts,
+        package_data,
+        package_dir,
+        ext_modules,
+        headers,
+    ) = get_setup_parameters()
+
+    # Log for PYPI, get long_description of setup()
+    with open(
+        paddle_source_dir + '/python/paddle/README.rst', "r", encoding='UTF-8'
+    ) as f:
+        long_description = f.read()
+
+    # strip *.so to reduce package size
+    if env_dict.get("WITH_STRIP") == 'ON':
+        command = (
+            'find '
+            + paddle_binary_dir
+            + '/python/paddle -name "*.so" | xargs -i strip {}'
+        )
+        if os.system(command) != 0:
+            raise Exception("strip *.so failed, command: %s" % command)
+
+    setup(
+        name=package_name,
+        version=paddle_version,
+        description='Parallel Distributed Deep Learning',
+        long_description=long_description,
+        long_description_content_type="text/markdown",
+        author_email="Paddle-better@baidu.com",
+        maintainer="PaddlePaddle",
+        maintainer_email="Paddle-better@baidu.com",
+        url='https://www.paddlepaddle.org.cn/',
+        download_url='https://github.com/paddlepaddle/paddle',
+        license='Apache Software License',
+        packages=packages,
+        install_requires=setup_requires,
+        ext_modules=ext_modules,
+        package_data=package_data,
+        package_dir=package_dir,
+        scripts=scripts,
+        distclass=BinaryDistribution,
+        headers=headers,
+        cmdclass={
+            'install_headers': InstallHeaders,
+            'install': InstallCommand,
+            'egg_info': EggInfo,
+            'install_lib': InstallLib,
+        },
+        entry_points={
+            'console_scripts': [
+                'fleetrun = paddle.distributed.launch.main:launch'
+            ]
+        },
+        classifiers=[
+            'Development Status :: 5 - Production/Stable',
+            'Operating System :: OS Independent',
+            'Intended Audience :: Developers',
+            'Intended Audience :: Education',
+            'Intended Audience :: Science/Research',
+            'License :: OSI Approved :: Apache Software License',
+            'Programming Language :: C++',
+            'Programming Language :: Python :: 2.7',
+            'Programming Language :: Python :: 3.5',
+            'Programming Language :: Python :: 3.6',
+            'Programming Language :: Python :: 3.7',
+            'Programming Language :: Python :: 3.8',
+        ],
+    )
+
+
+if __name__ == '__main__':
+    main()

From cb812f40e1502966d985483ce9fbf6184e1795b0 Mon Sep 17 00:00:00 2001
From: yunyaoXYY <109218879+yunyaoXYY@users.noreply.github.com>
Date: Mon, 5 Dec 2022 14:27:53 +0800
Subject: [PATCH 11/13] [Clean fluid] Clean hash, grid_sampler, log_loss,
 bilinear_tensor_product. (#48411)

* Clean fluid hash

* clean fluid grid_sampler

* clean log_loss

* Move bilinear_tensor_product from fluid to static

* Fix unit tests when removing log_loss

* Fix bug when moving bilinear_tensor_product

* fix test_fleet_nocvm_1.py

* Add bilinear_tensor_product to the __all__ list

* Fix code style

* Fix comments in bilinear_tensor_product

* Fix comments in bilinear_tensor_product

* Fix comments
---
 python/paddle/fluid/layers/nn.py              | 290 ----------------
 .../unittests/npu/test_log_loss_op_npu.py     |  32 --
 .../fluid/tests/unittests/test_fleet.py       |   2 +-
 .../tests/unittests/test_fleet_nocvm_1.py     |   4 +-
 .../tests/unittests/test_fleet_rolemaker.py   |   3 +-
 .../tests/unittests/test_fleet_rolemaker_2.py |   2 +-
 .../tests/unittests/test_fleet_rolemaker_3.py |   4 +-
 .../unittests/test_fleet_unitaccessor.py      |   4 +-
 .../fluid/tests/unittests/test_hash_op.py     |  41 ---
 .../tests/unittests/test_imperative_deepcf.py |   8 +-
 .../test_imperative_load_static_param.py      |  14 +-
 .../fluid/tests/unittests/test_layers.py      |  17 +-
 .../fluid/tests/unittests/test_log_loss_op.py |  31 --
 python/paddle/static/nn/__init__.py           |   4 +-
 python/paddle/static/nn/common.py             | 321 ++++++++++--------
 15 files changed, 211 insertions(+), 566 deletions(-)

diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 56765c19cba65..34c6387a1643c 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -96,10 +96,6 @@
     'clip_by_norm',
     'mean',
     'mul',
-    'hash',
-    'grid_sampler',
-    'log_loss',
-    'bilinear_tensor_product',
     'merge_selected_rows',
     'get_tensor_from_selected_rows',
     'unfold',
@@ -5223,292 +5219,6 @@ def mul(x, y, x_num_col_dims=1, y_num_col_dims=1, name=None):
     return out
 
 
-def hash(input, hash_size, num_hash=1, name=None):
-    """
-
-    This OP hash the input to an integer less than the hash_size.
-    The hash algorithm we used was xxHash - Extremely fast hash algorithm
-    (https://github.com/Cyan4973/xxHash/tree/v0.6.5)
-
-    Args:
-        input(Variable): A **Two-Dimensional** LoDTensor with type int32, int64.
-             **Only support LoDTensor**.
-        num_hash(int, optional): The times of hash, default is 1.
-        name(str, optional): The default value is None. Normally there is no
-            need for user to set this property. For more information, please
-            refer to :ref:`api_guide_Name`.
-
-    Returns:
-       Variable: A LoDTensor with the same data type as input.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            import numpy as np
-            import paddle
-            paddle.enable_static()
-
-            place = fluid.core.CPUPlace()
-
-            x = fluid.data(name="x", shape=[2,2], dtype="int32", lod_level=1)
-            res = fluid.layers.hash(name="res", input=x, hash_size=1000, num_hash=4)
-
-            exe = fluid.Executor(place)
-            exe.run(fluid.default_startup_program())
-            in1 = np.array([[1,2],[3,4]]).astype("int32")
-            print(in1)
-            x_i = fluid.create_lod_tensor(in1, [[0, 2]], place)
-            res = exe.run(fluid.default_main_program(), feed={'x':x_i}, fetch_list=[res], return_numpy=False)
-            print(np.array(res[0]))
-            # [[[722]
-            #   [407]
-            #   [337]
-            #   [395]]
-            #  [[603]
-            #   [590]
-            #   [386]
-            #   [901]]]
-    """
-    check_variable_and_dtype(input, 'input', ['int32', 'int64'], 'hash')
-    check_type(hash_size, 'hash_size', int, 'hash')
-    check_type(num_hash, 'num_hash', int, 'hash')
-    helper = LayerHelper('hash', **locals())
-    out = helper.create_variable_for_type_inference(
-        helper.input_dtype(), stop_gradient=True
-    )
-    helper.append_op(
-        type='hash',
-        inputs={'X': input},
-        outputs={'Out': out},
-        attrs={'num_hash': num_hash, 'mod_by': hash_size},
-    )
-    return out
-
-
-@templatedoc()
-def grid_sampler(x, grid, name=None):
-    """
-
-    This operation samples input X by using bilinear interpolation based on
-    flow field grid, which is usually generated by :code:`affine_grid` . The grid of
-    shape [N, H, W, 2] is the concatenation of (x, y) coordinates
-    with shape [N, H, W] each, where x is indexing the 4th dimension
-    (in width dimension) of input data x and y is indexing the 3rd
-    dimension (in height dimension), finally results is the bilinear
-    interpolation value of 4 nearest corner points. The output tensor
-    shape will be [N, C, H, W].
-
-    .. code-block:: text
-
-        Step 1:
-        Get (x, y) grid coordinates and scale to [0, H-1/W-1].
-
-        .. code-block:: text
-
-            grid_x = 0.5 * (grid[:, :, :, 0] + 1) * (W - 1)
-            grid_y = 0.5 * (grid[:, :, :, 1] + 1) * (H - 1)
-
-        Step 2:
-        Indices input data X with grid (x, y) in each [H, W] area, and bilinear
-        interpolate point value by 4 nearest points.
-
-          wn ------- y_n ------- en
-          |           |           |
-          |          d_n          |
-          |           |           |
-         x_w --d_w-- grid--d_e-- x_e
-          |           |           |
-          |          d_s          |
-          |           |           |
-          ws ------- y_s ------- wn
-
-        x_w = floor(x)              // west side x coord
-        x_e = x_w + 1               // east side x coord
-        y_n = floor(y)              // north side y coord
-        y_s = y_s + 1               // south side y coord
-
-        d_w = grid_x - x_w          // distance to west side
-        d_e = x_e - grid_x          // distance to east side
-        d_n = grid_y - y_n          // distance to north side
-        d_s = y_s - grid_y          // distance to south side
-
-        wn = X[:, :, y_n, x_w]      // north-west point value
-        en = X[:, :, y_n, x_e]      // north-east point value
-        ws = X[:, :, y_s, x_w]      // south-east point value
-        es = X[:, :, y_s, x_w]      // north-east point value
-
-        output = wn * d_e * d_s + en * d_w * d_s
-               + ws * d_e * d_n + es * d_w * d_n
-
-    Args:
-        x(Variable): The input tensor, which is a 4-D tensor with shape
-                     [N, C, H, W], N is the batch size, C is the channel
-                     number, H and W is the feature height and width.
-                     The data type is float32 or float64.
-        grid(Variable): Input grid tensor of shape [N, H, W, 2]. The
-                        data type is float32 or float64.
-        name(str, optional): For detailed information, please refer
-                             to :ref:`api_guide_Name`. Usually name is no need to set and
-                             None by default.
-
-    Returns:
-        Variable: Output of shape [N, C, H, W] data samples input X
-                  using bilnear interpolation based on input grid.
-                  The data type is same as input tensor.
-
-    Examples:
-
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            import paddle.fluid as fluid
-            import paddle
-
-            paddle.enable_static()
-            # use with affine_grid
-            x = fluid.data(name='x', shape=[None, 10, 32, 32], dtype='float32')
-            theta = fluid.layers.data(name='theta', shape=[2, 3], dtype='float32')
-            grid = fluid.layers.affine_grid(theta=theta, out_shape=[3, 10, 32, 32])
-            out = fluid.layers.grid_sampler(x=x, grid=grid)
-
-    """
-    helper = LayerHelper("grid_sampler", **locals())
-
-    check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'grid_sampler')
-    check_variable_and_dtype(
-        grid, 'grid', ['float32', 'float64'], 'grid_sampler'
-    )
-    if not isinstance(x, Variable):
-        return ValueError("The x should be a Variable")
-
-    if not isinstance(grid, Variable):
-        return ValueError("The grid should be a Variable")
-
-    out = helper.create_variable_for_type_inference(x.dtype)
-    ipts = {'X': x, 'Grid': grid}
-
-    attrs = {'use_cudnn': False} if core.is_compiled_with_rocm() else {}
-
-    helper.append_op(
-        type='grid_sampler', inputs=ipts, outputs={'Output': out}, attrs=attrs
-    )
-    return out
-
-
-def log_loss(input, label, epsilon=1e-4, name=None):
-    r"""
-
-    **Negative Log Loss Layer**
-
-    This layer accepts input predictions and target label and returns the
-    negative log loss.
-
-    .. math::
-
-        Out = -label * \log{(input + \epsilon)}
-              - (1 - label) * \log{(1 - input + \epsilon)}
-
-    Args:
-        input (Tensor|list):  A 2-D tensor with shape [N x 1], where N is the
-                                batch size. This input is a probability computed
-                                by the previous operator. Data type float32.
-        label (Tensor|list):  The ground truth which is a 2-D tensor with
-                                shape [N x 1], where N is the batch size.
-                                Data type float32.
-        epsilon (float, optional): A small number for numerical stability. Default 1e-4.
-        name(str|None): For detailed information, please refer to
-            :ref:`api_guide_Name` . Usually name is no need to set and None by default.
-
-    Returns:
-        Tensor, which shape is [N x 1], data type is float32.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle
-          import paddle.nn.functional as F
-
-          label = paddle.randn((10,1))
-          prob = paddle.randn((10,1))
-          cost = F.log_loss(input=prob, label=label)
-    """
-    return paddle.nn.functional.log_loss(input, label, epsilon, name)
-
-
-def bilinear_tensor_product(
-    x, y, size, act=None, name=None, param_attr=None, bias_attr=None
-):
-    r"""
-    :api_attr: Static Graph
-
-    **Bilinear Tensor Product Layer**
-
-    This layer performs bilinear tensor product on two inputs.
-    For example:
-
-    .. math::
-       out_{i} = x * W_{i} * {y^\mathrm{T}}, i=0,1,...,size-1
-
-    In this formula:
-      - :math:`x`: the first input contains M elements, shape is [batch_size, M].
-      - :math:`y`: the second input contains N elements, shape is [batch_size, N].
-      - :math:`W_{i}`: the i-th learned weight, shape is [M, N].
-      - :math:`out_{i}`: the i-th element of out, shape is [batch_size, size].
-      - :math:`y^\mathrm{T}`: the transpose of :math:`y_{2}`.
-
-    Args:
-        x (Variable): 2-D input tensor with shape [batch_size, M]. Data type
-            is float32 or float64.
-        y (Variable): 2-D input tensor with shape [batch_size, N]. Data type
-            should be same as **x**.
-        size (int): The dimension of this layer.
-        act (str|None): Activation to be applied to the output of this layer. Default None.
-        name(str|None): For detailed information, please refer to
-            :ref:`api_guide_Name` . Usually name is no need to set and None by default.
-        param_attr (ParamAttr|None): To specify the weight parameter attribute.
-            Default: None, which means the default weight parameter property is
-            used. See usage for details in :ref:`api_fluid_ParamAttr` .
-        bias_attr (ParamAttr|None): To specify the bias parameter attribute.
-            Default: None, which means the default bias parameter property is
-            used. See usage for details in :ref:`api_fluid_ParamAttr` .
-    Returns:
-        Variable: A 2-D Tensor of shape [batch_size, size]. Data type is the same as input **x**.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle
-            paddle.enable_static()
-            layer1 = paddle.static.data("t1", shape=[-1, 5], dtype="float32")
-            layer2 = paddle.static.data("t2", shape=[-1, 4], dtype="float32")
-            tensor = paddle.static.nn.bilinear_tensor_product(x=layer1, y=layer2, size=1000)
-    """
-    helper = LayerHelper('bilinear_tensor_product', **locals())
-    dtype = helper.input_dtype('x')
-
-    param_shape = [size, x.shape[1], y.shape[1]]
-
-    w = helper.create_parameter(
-        attr=helper.param_attr, shape=param_shape, dtype=dtype, is_bias=False
-    )
-    out = helper.create_variable_for_type_inference(dtype=dtype)
-
-    inputs = {"X": x, "Y": y, "Weight": w}
-    if helper.bias_attr:
-        bias_size = [1, size]
-        bias = helper.create_parameter(
-            attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True
-        )
-        inputs["Bias"] = bias
-    helper.append_op(
-        type="bilinear_tensor_product", inputs=inputs, outputs={"Out": out}
-    )
-
-    # add activation
-    return helper.append_activation(out)
-
-
 @templatedoc()
 def get_tensor_from_selected_rows(x, name=None):
     """
diff --git a/python/paddle/fluid/tests/unittests/npu/test_log_loss_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_log_loss_op_npu.py
index 87cd872e8cc91..c47b42ee125be 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_log_loss_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_log_loss_op_npu.py
@@ -76,37 +76,5 @@ def test_check_grad(self):
         self.check_grad_with_place(self.place, ['Predicted'], 'Loss')
 
 
-@unittest.skipIf(
-    not paddle.is_compiled_with_npu(), "core is not compiled with NPU"
-)
-class TestLogLossOpError(unittest.TestCase):
-    def test_errors(self):
-        with fluid.program_guard(fluid.Program()):
-
-            def test_x_type():
-                input_data = np.random.random(100, 1).astype("float32")
-                fluid.layers.log_loss(input_data)
-
-            self.assertRaises(TypeError, test_x_type)
-
-            def test_x_dtype():
-                x2 = fluid.layers.data(name='x2', shape=[100, 1], dtype='int32')
-                fluid.layers.log_loss(x2)
-
-            self.assertRaises(TypeError, test_x_dtype)
-
-            def test_label_type():
-                input_data = np.random.random(100, 1).astype("float32")
-                fluid.layers.log_loss(input_data)
-
-            self.assertRaises(TypeError, test_label_type)
-
-            def test_label_dtype():
-                x2 = fluid.layers.data(name='x2', shape=[100, 1], dtype='int32')
-                fluid.layers.log_loss(x2)
-
-            self.assertRaises(TypeError, test_label_dtype)
-
-
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fleet.py b/python/paddle/fluid/tests/unittests/test_fleet.py
index d0445c2c5e09b..6092710a798c0 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet.py
@@ -79,7 +79,7 @@ def test_pslib_1(self):
                 append_batch_size=False,
             )
             label_cast = fluid.layers.cast(label, dtype='float32')
-            cost = fluid.layers.log_loss(fc, label_cast)
+            cost = paddle.nn.functional.log_loss(fc, label_cast)
         try:
             adam = fluid.optimizer.Adam(learning_rate=0.000005)
             adam = fleet.distributed_optimizer(
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_nocvm_1.py b/python/paddle/fluid/tests/unittests/test_fleet_nocvm_1.py
index 577652037e538..f5975ae990d70 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_nocvm_1.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_nocvm_1.py
@@ -16,6 +16,8 @@
 import os
 import unittest
 
+import paddle
+
 
 class TestFleet1(unittest.TestCase):
     """
@@ -73,7 +75,7 @@ def test_pslib_1(self):
                 append_batch_size=False,
             )
             label_cast = fluid.layers.cast(label, dtype='float32')
-            cost = fluid.layers.log_loss(fc, label_cast)
+            cost = paddle.nn.functional.log_loss(fc, label_cast)
         try:
             adam = fluid.optimizer.Adam(learning_rate=0.000005)
             adam = fleet.distributed_optimizer(
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker.py b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker.py
index f64d8cb1692b2..daee01f38f742 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker.py
@@ -16,6 +16,7 @@
 import os
 import unittest
 
+import paddle
 import paddle.fluid.incubate.fleet.base.role_maker as role_maker
 
 
@@ -97,7 +98,7 @@ def test_pslib_1(self):
                 append_batch_size=False,
             )
             label_cast = fluid.layers.cast(label, dtype='float32')
-            cost = fluid.layers.log_loss(fc, label_cast)
+            cost = paddle.nn.functional.log_loss(fc, label_cast)
         try:
             adam = fluid.optimizer.Adam(learning_rate=0.000005)
             adam = fleet.distributed_optimizer(adam)
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_2.py b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_2.py
index a657d3deb51a0..7a6ba4248352a 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_2.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_2.py
@@ -79,7 +79,7 @@ def test_pslib_2(self):
                 append_batch_size=False,
             )
             label_cast = fluid.layers.cast(label, dtype='float32')
-            cost = fluid.layers.log_loss(fc, label_cast)
+            cost = paddle.nn.functional.log_loss(fc, label_cast)
         try:
             adam = fluid.optimizer.Adam(learning_rate=0.000005)
             adam = fleet.distributed_optimizer(adam)
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_3.py b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_3.py
index 79b5e136f189a..c3df410610ba9 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_3.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_3.py
@@ -16,6 +16,8 @@
 import os
 import unittest
 
+import paddle
+
 
 class TestCloudRoleMaker(unittest.TestCase):
     """
@@ -70,7 +72,7 @@ def test_pslib_1(self):
                 append_batch_size=False,
             )
             label_cast = fluid.layers.cast(label, dtype='float32')
-            cost = fluid.layers.log_loss(fc, label_cast)
+            cost = paddle.nn.functional.log_loss(fc, label_cast)
         try:
             adam = fluid.optimizer.Adam(learning_rate=0.000005)
             adam = fleet.distributed_optimizer(adam)
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_unitaccessor.py b/python/paddle/fluid/tests/unittests/test_fleet_unitaccessor.py
index 9c7736a39384f..78c4a4541e3c0 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_unitaccessor.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_unitaccessor.py
@@ -16,6 +16,8 @@
 import os
 import unittest
 
+import paddle
+
 
 class TestFleet1(unittest.TestCase):
     """
@@ -73,7 +75,7 @@ def test_pslib_1(self):
                 append_batch_size=False,
             )
             label_cast = fluid.layers.cast(label, dtype='float32')
-            cost = fluid.layers.log_loss(fc, label_cast)
+            cost = paddle.nn.functional.log_loss(fc, label_cast)
 
         strategy = {}
         strategy["embedding"] = {}
diff --git a/python/paddle/fluid/tests/unittests/test_hash_op.py b/python/paddle/fluid/tests/unittests/test_hash_op.py
index 53b1551c7b844..75ddd7bb89c8c 100644
--- a/python/paddle/fluid/tests/unittests/test_hash_op.py
+++ b/python/paddle/fluid/tests/unittests/test_hash_op.py
@@ -17,8 +17,6 @@
 import numpy as np
 from op_test import OpTest
 
-import paddle.fluid as fluid
-
 
 class TestHashOp(OpTest):
     def setUp(self):
@@ -120,44 +118,5 @@ def test_check_output(self):
         self.check_output()
 
 
-class TestHashOpError(unittest.TestCase):
-    def test_errors(self):
-        with fluid.program_guard(fluid.Program(), fluid.Program()):
-            input_data = np.random.randint(0, 10, (8, 1)).astype("int32")
-
-            def test_Variable():
-                # the input type must be Variable
-                fluid.layers.hash(input=input_data, hash_size=2**32)
-
-            self.assertRaises(TypeError, test_Variable)
-
-            def test_type():
-                # dtype must be int32, int64.
-                x2 = fluid.layers.data(
-                    name='x2', shape=[1], dtype="float32", lod_level=1
-                )
-                fluid.layers.hash(input=x2, hash_size=2**32)
-
-            self.assertRaises(TypeError, test_type)
-
-            def test_hash_size_type():
-                # hash_size dtype must be int32, int64.
-                x3 = fluid.layers.data(
-                    name='x3', shape=[1], dtype="int32", lod_level=1
-                )
-                fluid.layers.hash(input=x3, hash_size=1024.5)
-
-            self.assertRaises(TypeError, test_hash_size_type)
-
-            def test_num_hash_type():
-                # num_hash dtype must be int32, int64.
-                x4 = fluid.layers.data(
-                    name='x4', shape=[1], dtype="int32", lod_level=1
-                )
-                fluid.layers.hash(input=x4, hash_size=2**32, num_hash=2.5)
-
-            self.assertRaises(TypeError, test_num_hash_type)
-
-
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py
index c4e280ea46fd0..ecb4600163c4b 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py
@@ -271,7 +271,7 @@ def test_deefcf(self):
 
             deepcf = DeepCF(num_users, num_items, matrix)
             prediction = deepcf(users, items)
-            loss = paddle.sum(fluid.layers.log_loss(prediction, labels))
+            loss = paddle.sum(paddle.nn.functional.log_loss(prediction, labels))
             adam = fluid.optimizer.AdamOptimizer(0.01)
             adam.minimize(loss)
 
@@ -325,7 +325,7 @@ def test_deefcf(self):
                         to_variable(items_np[slice : slice + self.batch_size]),
                     )
                     loss = paddle.sum(
-                        fluid.layers.log_loss(
+                        paddle.nn.functional.log_loss(
                             prediction,
                             to_variable(
                                 labels_np[slice : slice + self.batch_size]
@@ -359,7 +359,7 @@ def test_deefcf(self):
                         to_variable(items_np[slice : slice + self.batch_size]),
                     )
                     loss2 = paddle.sum(
-                        fluid.layers.log_loss(
+                        paddle.nn.functional.log_loss(
                             prediction2,
                             to_variable(
                                 labels_np[slice : slice + self.batch_size]
@@ -402,7 +402,7 @@ def test_deefcf(self):
                             ),
                         )
                         loss = paddle.sum(
-                            fluid.layers.log_loss(
+                            paddle.nn.functional.log_loss(
                                 prediction,
                                 to_variable(
                                     labels_np[slice : slice + self.batch_size]
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py b/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py
index 169269cc03e31..a98d9b994b33a 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py
@@ -86,11 +86,15 @@ def testLoadStaticModel(self):
             "t2", shape=[None, 4], dtype="float32"
         )
 
-        bilinear_tensor_pro_out_1 = fluid.layers.bilinear_tensor_product(
-            x=bilinear_tensor_pro_x, y=bilinear_tensor_pro_y, size=1000
-        )
-        bilinear_tensor_pro_out_2 = fluid.layers.bilinear_tensor_product(
-            x=bilinear_tensor_pro_x, y=bilinear_tensor_pro_y, size=1000
+        bilinear_tensor_pro_out_1 = (
+            paddle.static.nn.common.bilinear_tensor_product(
+                x=bilinear_tensor_pro_x, y=bilinear_tensor_pro_y, size=1000
+            )
+        )
+        bilinear_tensor_pro_out_2 = (
+            paddle.static.nn.common.bilinear_tensor_product(
+                x=bilinear_tensor_pro_x, y=bilinear_tensor_pro_y, size=1000
+            )
         )
 
         conv2d_trans_in = fluid.data(
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index 62def4247037f..3f7edb6022a85 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -750,7 +750,7 @@ def test_bilinear_tensor_product(self):
             data_y = layers.data(
                 name='y', shape=[1, 3], dtype="float32", append_batch_size=False
             )
-            out = layers.bilinear_tensor_product(
+            out = paddle.static.nn.common.bilinear_tensor_product(
                 data_x,
                 data_y,
                 6,
@@ -825,7 +825,7 @@ def test_bilinear_tensor_product(self):
             data_y2 = layers.data(
                 name='y', shape=[1, 3], dtype="float32", append_batch_size=False
             )
-            out2 = layers.bilinear_tensor_product(
+            out2 = paddle.static.nn.common.bilinear_tensor_product(
                 data_x2, data_y2, 6, act='sigmoid'
             )
 
@@ -3418,15 +3418,6 @@ def make_iou_similarity(self):
             out = layers.iou_similarity(x, y, name='iou_similarity')
             return out
 
-    def make_grid_sampler(self):
-        with program_guard(
-            fluid.default_main_program(), fluid.default_startup_program()
-        ):
-            x = self._get_data(name='x', shape=[3, 5, 7], dtype='float32')
-            grid = self._get_data(name='grid', shape=[5, 7, 2], dtype='float32')
-            out = layers.grid_sampler(x, grid)
-            return out
-
     def make_bilinear_tensor_product_layer(self):
         with program_guard(
             fluid.default_main_program(), fluid.default_startup_program()
@@ -3434,7 +3425,9 @@ def make_bilinear_tensor_product_layer(self):
             data = self._get_data(name='data', shape=[4], dtype="float32")
 
             theta = self._get_data(name="theta", shape=[5], dtype="float32")
-            out = layers.bilinear_tensor_product(data, theta, 6)
+            out = paddle.static.nn.common.bilinear_tensor_product(
+                data, theta, 6
+            )
             return out
 
     def make_batch_norm(self):
diff --git a/python/paddle/fluid/tests/unittests/test_log_loss_op.py b/python/paddle/fluid/tests/unittests/test_log_loss_op.py
index 908f4bf94e510..25bede0af214b 100644
--- a/python/paddle/fluid/tests/unittests/test_log_loss_op.py
+++ b/python/paddle/fluid/tests/unittests/test_log_loss_op.py
@@ -17,8 +17,6 @@
 import numpy as np
 from op_test import OpTest
 
-import paddle.fluid as fluid
-
 
 def sigmoid_array(x):
     return 1 / (1 + np.exp(-x))
@@ -51,34 +49,5 @@ def test_check_grad(self):
         self.check_grad(['Predicted'], 'Loss', max_relative_error=0.03)
 
 
-class TestLogLossOpError(unittest.TestCase):
-    def test_errors(self):
-        with fluid.program_guard(fluid.Program()):
-
-            def test_x_type():
-                input_data = np.random.random(100, 1).astype("float32")
-                fluid.layers.log_loss(input_data)
-
-            self.assertRaises(TypeError, test_x_type)
-
-            def test_x_dtype():
-                x2 = fluid.layers.data(name='x2', shape=[100, 1], dtype='int32')
-                fluid.layers.log_loss(x2)
-
-            self.assertRaises(TypeError, test_x_dtype)
-
-            def test_label_type():
-                input_data = np.random.random(100, 1).astype("float32")
-                fluid.layers.log_loss(input_data)
-
-            self.assertRaises(TypeError, test_label_type)
-
-            def test_label_dtype():
-                x2 = fluid.layers.data(name='x2', shape=[100, 1], dtype='int32')
-                fluid.layers.log_loss(x2)
-
-            self.assertRaises(TypeError, test_label_dtype)
-
-
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/static/nn/__init__.py b/python/paddle/static/nn/__init__.py
index 1849cfd395a55..9635811f6a818 100755
--- a/python/paddle/static/nn/__init__.py
+++ b/python/paddle/static/nn/__init__.py
@@ -20,11 +20,11 @@
 from .common import conv3d  # noqa: F401
 from .common import conv2d_transpose  # noqa: F401
 from .common import conv3d_transpose  # noqa: F401
+from .common import bilinear_tensor_product  # noqa: F401
 from .common import py_func  # noqa: F401
 
 from ...tensor.creation import create_parameter  # noqa: F401
 from ...fluid.layers import batch_norm  # noqa: F401
-from ...fluid.layers import bilinear_tensor_product  # noqa: F401
 from ...fluid.layers import case  # noqa: F401
 from ...fluid.layers import cond  # noqa: F401
 from ...fluid.layers import conv2d  # noqa: F401
@@ -61,8 +61,8 @@
 __all__ = [  # noqa
     'fc',
     'batch_norm',
-    'embedding',
     'bilinear_tensor_product',
+    'embedding',
     'case',
     'cond',
     'conv2d',
diff --git a/python/paddle/static/nn/common.py b/python/paddle/static/nn/common.py
index a8dec018ff14a..420a00ddbdc51 100755
--- a/python/paddle/static/nn/common.py
+++ b/python/paddle/static/nn/common.py
@@ -2088,6 +2088,184 @@ def deform_conv2d(
         )
 
 
+def bilinear_tensor_product(
+    x, y, size, act=None, name=None, param_attr=None, bias_attr=None
+):
+    r"""
+    This layer performs bilinear tensor product on two inputs.
+
+    .. math::
+
+       out_{i} = x * W_{i} * {y^\mathrm{T}}, i=0,1,...,size-1
+
+    In this formula:
+      - :math:`x`: the first input contains M elements, shape is [batch_size, M].
+      - :math:`y`: the second input contains N elements, shape is [batch_size, N].
+      - :math:`W_{i}`: the i-th learned weight, shape is [M, N].
+      - :math:`out_{i}`: the i-th element of out, shape is [batch_size, size].
+      - :math:`y^\mathrm{T}`: the transpose of :math:`y`.
+
+    Args:
+        x (Variable): 2-D input tensor with shape [batch_size, M]. Data type
+            is float32 or float64.
+        y (Variable): 2-D input tensor with shape [batch_size, N]. Data type
+            should be same as **x**.
+        size (int): The dimension of this layer.
+        act (str|None): Activation to be applied to the output of this layer. Default None.
+        name(str|None): For detailed information, please refer to
+            :ref:`api_guide_Name` . Usually name does not need to be set. Default None.
+        param_attr (ParamAttr|None): To specify the weight parameter attribute.
+            Default: None, which means the default weight parameter property is
+            used. See usage for details in :ref:`api_fluid_ParamAttr` .
+        bias_attr (ParamAttr|None): To specify the bias parameter attribute.
+            Default: None, which means the default bias parameter property is
+            used. See usage for details in :ref:`api_fluid_ParamAttr` .
+
+    Returns:
+        Tensor, A 2-D Tensor of shape [batch_size, size]. Data type is the same as input **x**.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            paddle.enable_static()
+
+            x = paddle.static.data("t1", shape=[-1, 5], dtype="float32")
+            y = paddle.static.data("t2", shape=[-1, 4], dtype="float32")
+            tensor = paddle.static.nn.bilinear_tensor_product(x, y, size=1000)
+
+    """
+    helper = LayerHelper('bilinear_tensor_product', **locals())
+    dtype = helper.input_dtype('x')
+
+    param_shape = [size, x.shape[1], y.shape[1]]
+
+    w = helper.create_parameter(
+        attr=helper.param_attr, shape=param_shape, dtype=dtype, is_bias=False
+    )
+    out = helper.create_variable_for_type_inference(dtype=dtype)
+
+    inputs = {"X": x, "Y": y, "Weight": w}
+    if helper.bias_attr:
+        bias_size = [1, size]
+        bias = helper.create_parameter(
+            attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True
+        )
+        inputs["Bias"] = bias
+    helper.append_op(
+        type="bilinear_tensor_product", inputs=inputs, outputs={"Out": out}
+    )
+
+    # add activation
+    return helper.append_activation(out)
+
+
+@static_only
+def prelu(x, mode, param_attr=None, data_format="NCHW", name=None):
+    r"""
+
+    prelu activation.
+
+    .. math::
+        prelu(x) = max(0, x) + \alpha * min(0, x)
+
+    There are three modes for the activation:
+
+    .. code-block:: text
+
+        all: All elements share same alpha.
+        channel: Elements in same channel share same alpha.
+        element: All elements do not share alpha. Each element has its own alpha.
+
+    Parameters:
+        x (Tensor): The input Tensor or LoDTensor with data type float16, float32 or float64.
+        mode (str): The mode for weight sharing.
+        param_attr (ParamAttr|None, optional): The parameter attribute for the learnable \
+            weight (alpha), it can be created by ParamAttr. None by default. \
+            For detailed information, please refer to :ref:`api_paddle_ParamAttr`.
+        data_format(str, optional): Data format that specifies the layout of input.
+            It may be "NC", "NCL", "NCHW", "NCDHW", "NLC", "NHWC" or "NDHWC". Default: "NCHW".
+        name (str, optional): Name for the operation (optional, default is None). \
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        Tensor: A tensor with the same shape and data type as x.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            paddle.enable_static()
+
+            x = paddle.static.data(name="x", shape=[None,5,10,10], dtype="float32")
+            mode = 'channel'
+            output = paddle.static.nn.prelu(
+                x,mode,param_attr=paddle.ParamAttr(name='alpha'))
+
+    """
+    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'prelu')
+
+    helper = LayerHelper('prelu', **locals())
+    if mode not in ['all', 'channel', 'element']:
+        raise ValueError('mode should be one of all, channel, element.')
+
+    alpha_shape = [1]
+    if mode == 'channel':
+
+        true_data_format = [
+            'NC',
+            'NCL',
+            'NCHW',
+            'NCDHW',
+            'NLC',
+            'NHWC',
+            'NDHWC',
+        ]
+        if data_format not in true_data_format:
+            raise ValueError(
+                "data_format must be one of 'NC', 'NCL', 'NCHW', 'NCDHW', "
+                "'NLC', 'NHWC', 'NDHWC' but receive {}".format(data_format)
+            )
+
+        data_format = 'NCHW' if data_format[1] == 'C' else 'NHWC'
+
+        assert (
+            len(x.shape) >= 2
+        ), "The size of input shape should be equal or larger than 2 in prelu() when mode is 'channel'"
+        # NOTE(zhiqiu): The alpha_shape should be [1, channel] + [1] * len(x.shape[2:]).
+        # To be consistent with Prelu, it is simplified.
+        # NOTE(zhiqiu): Revert shape to [1, channel, 1, 1] for compatibility with saved model of old version.
+        # NOTE(GuoxiaWang): support NHWC data format
+        if data_format == 'NHWC':
+            alpha_shape = [1, 1, 1, x.shape[-1]]
+        else:
+            alpha_shape = [1, x.shape[1], 1, 1]
+
+    elif mode == 'element':
+        assert (
+            len(x.shape) >= 1
+        ), "The size of input shape should be equal or larger than 1 in prelu() when mode is 'element'"
+        alpha_shape = [1] + list(x.shape)[1:]
+    dtype = helper.input_dtype(input_param_name='x')
+    alpha = helper.create_parameter(
+        attr=helper.param_attr,
+        shape=alpha_shape,
+        dtype=dtype,
+        is_bias=False,
+        default_initializer=paddle.nn.initializer.Constant(0.25),
+    )
+
+    out = helper.create_variable_for_type_inference(dtype)
+    helper.append_op(
+        type="prelu",
+        inputs={"X": x, 'Alpha': alpha},
+        attrs={"mode": mode, "data_format": data_format},
+        outputs={"Out": out},
+    )
+    return out
+
+
 class PyFuncRegistry:
     _register_funcs = []
 
@@ -2106,12 +2284,10 @@ def __init__(self, func):
         self._id = core._append_python_callable_object_and_return_id(self)
         '''
         Why record self here?
-
         1. For debug usage. Users can call
            :code:`py_func.registered_func(idx)` method
            to find the registered function corresponding
            to :code:`idx`.
-
         2. For increasing reference count of self.
            It seems that to release Python object
            whose reference count is 1 would cause
@@ -2169,25 +2345,20 @@ def py_func(func, x, out, backward_func=None, skip_vars_in_backward_input=None):
     This is used to register customized Python OP to Paddle. The design
     principe of py_func is that Tensor and numpy array can be converted to each
     other easily. So you can use Python and numpy API to register a python OP.
-
     The forward function of the registered OP is ``func`` and the backward function
     of that is ``backward_func``. Paddle will call ``func`` at forward runtime and
     call ``backward_func`` at backward runtime(if ``backward_func`` is not  None).
     ``x`` is the input of ``func``, whose type must be Tensor; ``out`` is
     the output of ``func``, whose type can be either Tensor or numpy array.
-
     The input of the backward function ``backward_func`` is ``x``, ``out`` and
     the gradient of ``out``. If ``out`` have no gradient, the relevant input of
     ``backward_func`` is None. If ``x`` do not have a gradient, the user should
     return None in ``backward_func``.
-
     The data type and shape of ``out`` should also be set correctly before this
     API is called, and the data type and shape of the gradient of ``out`` and
     ``x`` will be inferred automatically.
-
     This API can also be used to debug the neural network by setting the ``func``
     as a function that only print variables.
-
     Args:
         func (callable): The forward function of the registered OP. When the network
             is running, the forward output ``out`` will be calculated according to this
@@ -2211,61 +2382,47 @@ def py_func(func, x, out, backward_func=None, skip_vars_in_backward_input=None):
             that no tensors need to be removed from ``x`` and ``out``. If it is not None,
             these tensors will not be the input of ``backward_func``. This parameter is only
             useful when ``backward_func`` is not None.
-
     Returns:
         Tensor|tuple(Tensor)|list[Tensor]: The output ``out`` of the forward function ``func``.
-
     Examples:
         .. code-block:: python
-
             # example 1:
             import paddle
             import numpy as np
-
             paddle.enable_static()
-
             # Creates a forward function, Tensor can be input directly without
             # being converted into numpy array.
             def tanh(x):
                 return np.tanh(x)
-
             # Skip x in backward function and return the gradient of x
             # Tensor must be actively converted to numpy array, otherwise,
             # operations such as +/- can't be used.
             def tanh_grad(y, dy):
                 return np.array(dy) * (1 - np.square(np.array(y)))
-
             # Creates a forward function for debugging running networks(print value)
             def debug_func(x):
                 print(x)
-
             def create_tmp_var(name, dtype, shape):
                 return paddle.static.default_main_program().current_block().create_var(
                     name=name, dtype=dtype, shape=shape)
-
             def simple_net(img, label):
                 hidden = img
                 for idx in range(4):
                     hidden = paddle.static.nn.fc(hidden, size=200)
                     new_hidden = create_tmp_var(name='hidden_{}'.format(idx),
                         dtype=hidden.dtype, shape=hidden.shape)
-
                     # User-defined forward and backward
                     hidden = paddle.static.py_func(func=tanh, x=hidden,
                         out=new_hidden, backward_func=tanh_grad,
                         skip_vars_in_backward_input=hidden)
-
                     # User-defined debug functions that print out the input Tensor
                     paddle.static.py_func(func=debug_func, x=hidden, out=None)
-
                 prediction = paddle.static.nn.fc(hidden, size=10, activation='softmax')
                 ce_loss = paddle.nn.loss.CrossEntropyLoss()
                 return ce_loss(prediction, label)
-
             x = paddle.static.data(name='x', shape=[1,4], dtype='float32')
             y = paddle.static.data(name='y', shape=[1], dtype='int64')
             res = simple_net(x, y)
-
             exe = paddle.static.Executor(paddle.CPUPlace())
             exe.run(paddle.static.default_startup_program())
             input1 = np.random.random(size=[1,4]).astype('float32')
@@ -2274,54 +2431,40 @@ def simple_net(img, label):
                           feed={'x':input1, 'y':input2},
                           fetch_list=[res.name])
             print(out)
-
         .. code-block:: python
-
             # example 2:
             # This example shows how to turn Tensor into numpy array and
             # use numpy API to register an Python OP
             import paddle
             import numpy as np
-
             paddle.enable_static()
-
             def element_wise_add(x, y):
                 # Tensor must be actively converted to numpy array, otherwise,
                 # numpy.shape can't be used.
                 x = np.array(x)
                 y = np.array(y)
-
                 if x.shape != y.shape:
                     raise AssertionError("the shape of inputs must be the same!")
-
                 result = np.zeros(x.shape, dtype='int32')
                 for i in range(len(x)):
                     for j in range(len(x[0])):
                         result[i][j] = x[i][j] + y[i][j]
-
                 return result
-
             def create_tmp_var(name, dtype, shape):
                 return paddle.static.default_main_program().current_block().create_var(
                             name=name, dtype=dtype, shape=shape)
-
             def py_func_demo():
                 start_program = paddle.static.default_startup_program()
                 main_program = paddle.static.default_main_program()
-
                 # Input of the forward function
                 x = paddle.static.data(name='x', shape=[2,3], dtype='int32')
                 y = paddle.static.data(name='y', shape=[2,3], dtype='int32')
-
                 # Output of the forward function, name/dtype/shape must be specified
                 output = create_tmp_var('output','int32', [3,1])
-
                 # Multiple Variable should be passed in the form of tuple(Variale) or list[Variale]
                 paddle.static.py_func(func=element_wise_add, x=[x,y], out=output)
-
                 exe=paddle.static.Executor(paddle.CPUPlace())
                 exe.run(start_program)
-
                 # Feed numpy array to main_program
                 input1 = np.random.randint(1, 10, size=[2,3], dtype='int32')
                 input2 = np.random.randint(1, 10, size=[2,3], dtype='int32')
@@ -2329,9 +2472,7 @@ def py_func_demo():
                             feed={'x':input1, 'y':input2},
                             fetch_list=[output.name])
                 print("{0} + {1} = {2}".format(input1, input2, out))
-
             py_func_demo()
-
             # Reference output:
             # [[5, 9, 9]   + [[7, 8, 4]  =  [array([[12, 17, 13]
             #  [7, 5, 2]]     [1, 3, 3]]            [8, 8, 5]], dtype=int32)]
@@ -2405,109 +2546,3 @@ def py_func_demo():
 # For debug usage
 py_func.registered_func = PyFuncRegistry.registered_func
 py_func.registered_func_num = PyFuncRegistry.registered_func_num
-
-
-@static_only
-def prelu(x, mode, param_attr=None, data_format="NCHW", name=None):
-    r"""
-
-    prelu activation.
-
-    .. math::
-        prelu(x) = max(0, x) + \alpha * min(0, x)
-
-    There are three modes for the activation:
-
-    .. code-block:: text
-
-        all: All elements share same alpha.
-        channel: Elements in same channel share same alpha.
-        element: All elements do not share alpha. Each element has its own alpha.
-
-    Parameters:
-        x (Tensor): The input Tensor or LoDTensor with data type float32.
-        mode (str): The mode for weight sharing.
-        param_attr (ParamAttr|None, optional): The parameter attribute for the learnable \
-            weight (alpha), it can be create by ParamAttr. None by default. \
-            For detailed information, please refer to :ref:`api_paddle_ParamAttr`.
-        data_format(str, optional): Data format that specifies the layout of input.
-            It may be "NC", "NCL", "NCHW", "NCDHW", "NLC", "NHWC" or "NDHWC". Default: "NCHW".
-        name (str, optional): Name for the operation (optional, default is None). \
-            For more information, please refer to :ref:`api_guide_Name`.
-
-    Returns:
-        Tensor: A tensor with the same shape and data type as x.
-
-    Examples:
-
-        .. code-block:: python
-
-            import paddle
-            paddle.enable_static()
-
-            x = paddle.static.data(name="x", shape=[None,5,10,10], dtype="float32")
-            mode = 'channel'
-            output = paddle.static.nn.prelu(
-                x,mode,param_attr=paddle.ParamAttr(name='alpha'))
-
-    """
-    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'prelu')
-
-    helper = LayerHelper('prelu', **locals())
-    if mode not in ['all', 'channel', 'element']:
-        raise ValueError('mode should be one of all, channel, element.')
-
-    alpha_shape = [1]
-    if mode == 'channel':
-
-        true_data_format = [
-            'NC',
-            'NCL',
-            'NCHW',
-            'NCDHW',
-            'NLC',
-            'NHWC',
-            'NDHWC',
-        ]
-        if data_format not in true_data_format:
-            raise ValueError(
-                "data_format must be one of 'NC', 'NCL', 'NCHW', 'NCDHW', "
-                "'NLC', 'NHWC', 'NDHWC' but receive {}".format(data_format)
-            )
-
-        data_format = 'NCHW' if data_format[1] == 'C' else 'NHWC'
-
-        assert (
-            len(x.shape) >= 2
-        ), "The size of input shape should be equal or larger than 2 in prelu() when mode is 'channel'"
-        # NOTE(zhiqiu): The alpha_shape should be [1, channel] + [1] * len(x.shape[2:]).
-        # To be consistent with Prelu, it is simplified.
-        # NOTE(zhiqiu): Revert shape to [1, channel, 1, 1] for compatibility with saved model of old version.
-        # NOTE(GuoxiaWang): support NHWC data format
-        if data_format == 'NHWC':
-            alpha_shape = [1, 1, 1, x.shape[-1]]
-        else:
-            alpha_shape = [1, x.shape[1], 1, 1]
-
-    elif mode == 'element':
-        assert (
-            len(x.shape) >= 1
-        ), "The size of input shape should be equal or larger than 1 in prelu() when mode is 'element'"
-        alpha_shape = [1] + list(x.shape)[1:]
-    dtype = helper.input_dtype(input_param_name='x')
-    alpha = helper.create_parameter(
-        attr=helper.param_attr,
-        shape=alpha_shape,
-        dtype=dtype,
-        is_bias=False,
-        default_initializer=paddle.nn.initializer.Constant(0.25),
-    )
-
-    out = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type="prelu",
-        inputs={"X": x, 'Alpha': alpha},
-        attrs={"mode": mode, "data_format": data_format},
-        outputs={"Out": out},
-    )
-    return out

From 61a1f68845f65a12723ac2a667d083a1ab27399e Mon Sep 17 00:00:00 2001
From: Guanghua Yu <742925032@qq.com>
Date: Mon, 5 Dec 2022 15:14:18 +0800
Subject: [PATCH 12/13] Support matmul in QAT and loading quantized models in
 PTQ (#47892)

---
 .../slim/quantization/imperative/utils.py      |  1 +
 .../slim/quantization/quantization_pass.py     | 18 ++++++++++++++++++
 python/paddle/nn/quant/__init__.py             |  1 +
 python/paddle/nn/quant/functional_layers.py    | 10 +++++++++-
 4 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py b/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py
index d771b51e09d11..e5ed14cb9f1e1 100644
--- a/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py
+++ b/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py
@@ -63,6 +63,7 @@
     paddle.nn.quant.subtract,
     paddle.nn.quant.multiply,
     paddle.nn.quant.divide,
+    paddle.nn.quant.matmul,
 ]
 
 fake_quant_leaf_layers = [
diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
index 6d99f0949d4a7..705b0e5e69ee6 100644
--- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
+++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
@@ -1939,6 +1939,15 @@ def apply(self, graph):
                     op_node.op()._set_attr("activation_bits", self._quant_bits)
                     op_node.op()._set_attr("with_quant_attr", True)
                     arg_names = utils._get_op_input_var_names(op_node)
+                    # If already quantized, skip it.
+                    skip_quant = False
+                    for arg_name in arg_names:
+                        if "quantized.dequantized" in arg_name:
+                            skip_quant = True
+                            break
+                    if skip_quant:
+                        continue
+
                     for arg_name in arg_names:
                         in_node = graph._find_node_by_name(
                             op_node.inputs, arg_name
@@ -2797,6 +2806,15 @@ def apply(self, graph):
                         continue
 
                     arg_names = utils._get_op_input_var_names(op_node)
+                    # If already quantized, skip it.
+                    skip_quant = False
+                    for arg_name in arg_names:
+                        if "quantized.dequantized" in arg_name:
+                            skip_quant = True
+                            break
+                    if skip_quant:
+                        continue
+
                     for arg_name in arg_names:
                         in_node = graph._find_node_by_name(
                             op_node.inputs, arg_name
diff --git a/python/paddle/nn/quant/__init__.py b/python/paddle/nn/quant/__init__.py
index 8973761ab6944..f96558bfbed15 100644
--- a/python/paddle/nn/quant/__init__.py
+++ b/python/paddle/nn/quant/__init__.py
@@ -21,6 +21,7 @@
 from .functional_layers import transpose  # noqa: F401
 from .functional_layers import concat  # noqa: F401
 from .functional_layers import flatten  # noqa: F401
+from .functional_layers import matmul  # noqa: F401
 from .quant_layers import QuantStub  # noqa: F401
 
 __all__ = []
diff --git a/python/paddle/nn/quant/functional_layers.py b/python/paddle/nn/quant/functional_layers.py
index 2986e3e0500f9..3a0fafe6b6ad1 100644
--- a/python/paddle/nn/quant/functional_layers.py
+++ b/python/paddle/nn/quant/functional_layers.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from ...tensor import manipulation, math
+from ...tensor import linalg, manipulation, math
 from .. import Layer
 
 __all__ = []
@@ -85,3 +85,11 @@ def __init__(self):
 
     def forward(self, x, start_axis=0, stop_axis=-1, name=None):
         return manipulation.flatten(x, start_axis, stop_axis, name)
+
+
+class matmul(FloatFunctionalLayer):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x, y, transpose_x=False, transpose_y=False, name=None):
+        return linalg.matmul(x, y, transpose_x, transpose_y, name)

From 97aa938f2bed793e17e67cb6cf08e52307532288 Mon Sep 17 00:00:00 2001
From: lzydev <1528794076@qq.com>
Date: Mon, 5 Dec 2022 15:39:51 +0800
Subject: [PATCH 13/13] Generate static graph code of some ops by yaml (#48698)

* generate static graph code of some ops by yaml, test = develop

* generate static graph code of some ops by yaml, test = develop
---
 paddle/fluid/operators/lu_unpack_op.cc   | 142 ----------------------
 paddle/fluid/operators/mode_op.cc        | 123 -------------------
 paddle/fluid/operators/nll_loss_op.cc    | 148 -----------------------
 paddle/fluid/operators/qr_op.cc          | 120 ------------------
 paddle/fluid/operators/renorm_op.cc      |  89 --------------
 paddle/phi/api/yaml/backward.yaml        |  50 ++++++++
 paddle/phi/api/yaml/legacy_backward.yaml |  50 --------
 paddle/phi/api/yaml/legacy_ops.yaml      |  49 --------
 paddle/phi/api/yaml/op_compat.yaml       |  32 +++++
 paddle/phi/api/yaml/ops.yaml             |  49 ++++++++
 paddle/phi/ops/compat/lu_unpack_sig.cc   |  37 ------
 paddle/phi/ops/compat/mode_sig.cc        |  34 ------
 paddle/phi/ops/compat/nll_loss_sig.cc    |  38 ------
 paddle/phi/ops/compat/qr_sig.cc          |  31 -----
 paddle/phi/ops/compat/renorm_sig.cc      |  34 ------
 15 files changed, 131 insertions(+), 895 deletions(-)
 delete mode 100644 paddle/fluid/operators/lu_unpack_op.cc
 delete mode 100644 paddle/fluid/operators/mode_op.cc
 delete mode 100644 paddle/fluid/operators/nll_loss_op.cc
 delete mode 100644 paddle/fluid/operators/qr_op.cc
 delete mode 100644 paddle/fluid/operators/renorm_op.cc
 delete mode 100644 paddle/phi/ops/compat/lu_unpack_sig.cc
 delete mode 100644 paddle/phi/ops/compat/mode_sig.cc
 delete mode 100644 paddle/phi/ops/compat/nll_loss_sig.cc
 delete mode 100644 paddle/phi/ops/compat/qr_sig.cc
 delete mode 100644 paddle/phi/ops/compat/renorm_sig.cc

diff --git a/paddle/fluid/operators/lu_unpack_op.cc b/paddle/fluid/operators/lu_unpack_op.cc
deleted file mode 100644
index 9f631a60c1556..0000000000000
--- a/paddle/fluid/operators/lu_unpack_op.cc
+++ /dev/null
@@ -1,142 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/infershape_utils.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-#include "paddle/phi/infermeta/backward.h"
-#include "paddle/phi/infermeta/binary.h"
-
-namespace paddle {
-namespace operators {
-
-class LU_UnpackOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddComment(R"DOC(Unpack L U and P to single matrix tensor,
-                unpack L and U matrix from LU, unpack permutation matrix Pmat from Pivtos .
-                )DOC");
-    AddInput("X", "(Tensor) The input LU tensor, shape of (*,m,n)");
-    AddInput("Pivots",
-             "(Tensor) The input Pivots tensor, shape of (*,min(m,n))");
-    AddOutput(
-        "Pmat",
-        "(Tensor) The output permutation matrix tensor, shape of (*, m, m)");
-    AddOutput("L", "(Tensor) The output lower triangular matrix tensor");
-    AddOutput("U", "(Tensor) The output upper triangular matrix tensor");
-    AddAttr<bool>("unpack_ludata", "Whether to unpack L and U")
-        .SetDefault(true);
-    AddAttr<bool>("unpack_pivots", "Whether to unpack permutation matrix")
-        .SetDefault(true);
-  }
-};
-
-class LU_UnpackOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
-  }
-};
-
-class LU_UnpackOpVarTypeInference : public framework::VarTypeInference {
- public:
-  void operator()(framework::InferVarTypeContext *ctx) const override {
-    auto var_type = ctx->GetInputType("X", 0);
-    auto data_type = ctx->GetInputDataType("X", 0);
-
-    ctx->SetOutputType("L", var_type, framework::ALL_ELEMENTS);
-    ctx->SetOutputDataType("L", data_type, framework::ALL_ELEMENTS);
-
-    ctx->SetOutputType("U", var_type, framework::ALL_ELEMENTS);
-    ctx->SetOutputDataType("U", data_type, framework::ALL_ELEMENTS);
-
-    ctx->SetOutputType("Pmat", var_type, framework::ALL_ELEMENTS);
-    ctx->SetOutputDataType("Pmat", data_type, framework::ALL_ELEMENTS);
-  }
-};
-
-template <typename T>
-class LU_UnpackOpGradMaker : public framework::SingleGradOpMaker<T> {
- public:
-  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
-
- protected:
-  void Apply(GradOpPtr<T> retv) const override {
-    retv->SetType("lu_unpack_grad");
-    retv->SetInput("X", this->Input("X"));
-    retv->SetInput("Pivots", this->Input("Pivots"));
-    retv->SetInput("L", this->Output("L"));
-    retv->SetInput("U", this->Output("U"));
-    retv->SetInput("Pmat", this->Output("Pmat"));
-
-    retv->SetInput(framework::GradVarName("L"), this->OutputGrad("L"));
-    retv->SetInput(framework::GradVarName("U"), this->OutputGrad("U"));
-    retv->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
-    retv->SetAttrMap(this->Attrs());
-  }
-};
-
-class LU_UnpackGradOpVarTypeInference : public framework::VarTypeInference {
- public:
-  void operator()(framework::InferVarTypeContext *ctx) const override {
-    auto var_type = ctx->GetInputType("X", 0);
-    auto data_type = ctx->GetInputDataType("X", 0);
-
-    ctx->SetOutputType(
-        framework::GradVarName("X"), var_type, framework::ALL_ELEMENTS);
-    ctx->SetOutputDataType(
-        framework::GradVarName("X"), data_type, framework::ALL_ELEMENTS);
-  }
-};
-
-class LU_UnpackGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    auto dtype = OperatorWithKernel::IndicateVarDataType(ctx, "X");
-    return framework::OpKernelType(dtype, ctx.GetPlace());
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-DECLARE_INFER_SHAPE_FUNCTOR(lu_unpack,
-                            LUUnpackInferMetaFunctor,
-                            PD_INFER_META(phi::LUUnpackInferMeta));
-DECLARE_INFER_SHAPE_FUNCTOR(lu_unpack_grad,
-                            LUUnpackGradInferMetaFunctor,
-                            PD_INFER_META(phi::LUUnpackGradInferMeta));
-
-REGISTER_OPERATOR(lu_unpack,
-                  ops::LU_UnpackOp,
-                  ops::LU_UnpackOpMaker,
-                  ops::LU_UnpackOpVarTypeInference,
-                  ops::LU_UnpackOpGradMaker<paddle::framework::OpDesc>,
-                  ops::LU_UnpackOpGradMaker<paddle::imperative::OpBase>,
-                  LUUnpackInferMetaFunctor);
-REGISTER_OPERATOR(lu_unpack_grad,
-                  ops::LU_UnpackGradOp,
-                  ops::LU_UnpackGradOpVarTypeInference,
-                  LUUnpackGradInferMetaFunctor);
diff --git a/paddle/fluid/operators/mode_op.cc b/paddle/fluid/operators/mode_op.cc
deleted file mode 100644
index 472526623511e..0000000000000
--- a/paddle/fluid/operators/mode_op.cc
+++ /dev/null
@@ -1,123 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/generator.h"
-#include "paddle/fluid/framework/infershape_utils.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/op_version_registry.h"
-#include "paddle/phi/core/infermeta_utils.h"
-#include "paddle/phi/infermeta/unary.h"
-
-namespace paddle {
-namespace operators {
-
-class ModeOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    framework::LibraryType library_{framework::LibraryType::kPlain};
-    phi::DataLayout layout_ = phi::DataLayout::kAnyLayout;
-    return framework::OpKernelType(
-        OperatorWithKernel::IndicateVarDataType(ctx, "X"),
-        ctx.device_context(),
-        layout_,
-        library_);
-  }
-};
-
-class ModeOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "(Tensor) The input of Mode op");
-    AddOutput("Out", "(Tensor) The output tensor of Topk op");
-    AddOutput("Indices", "(Tensor) The indices of Topk elements of input");
-    AddAttr<int>("axis",
-                 "the axis to calculate mode values."
-                 "if not set, will calculate on last axis.")
-        .SetDefault(-1);
-    AddAttr<bool>("keepdim", "Keep the dim that to reduce.").SetDefault(false);
-    AddComment(R"DOC(
-This operator finds the mode of input Tensor. And outputs their values and indices as vectors.
-)DOC");
-  }
-};
-
-class ModeOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(
-        ctx->HasInput("X"),
-        true,
-        platform::errors::InvalidArgument("Input(X) should be not null"));
-    PADDLE_ENFORCE_EQ(
-        ctx->HasInput("Indices"),
-        true,
-        platform::errors::InvalidArgument("Input(Indices) should be not null"));
-    PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")),
-                      true,
-                      platform::errors::InvalidArgument(
-                          "Grad Input(Out) should be not null"));
-    PADDLE_ENFORCE_EQ(
-        ctx->HasOutput(framework::GradVarName("X")),
-        true,
-        platform::errors::InvalidArgument("Grad Output(X) should be not null"));
-
-    auto x_dims = ctx->GetInputDim("X");
-    ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    auto data_type = OperatorWithKernel::IndicateVarDataType(
-        ctx, framework::GradVarName("Out"));
-    return framework::OpKernelType(data_type, ctx.device_context());
-  }
-};
-
-template <typename T>
-class ModeGradOpMaker : public framework::SingleGradOpMaker<T> {
- public:
-  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
-
- protected:
-  void Apply(GradOpPtr<T> op) const override {
-    op->SetType("mode_grad");
-    op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
-    op->SetInput("X", this->Input("X"));
-    op->SetInput("Indices", this->Output("Indices"));
-    op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
-    op->SetAttrMap(this->Attrs());
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-DECLARE_INFER_SHAPE_FUNCTOR(mode,
-                            ModeInferShapeFunctor,
-                            PD_INFER_META(phi::ModeInferMeta));
-REGISTER_OPERATOR(mode,
-                  ops::ModeOp,
-                  ops::ModeOpMaker,
-                  ops::ModeGradOpMaker<paddle::framework::OpDesc>,
-                  ops::ModeGradOpMaker<paddle::imperative::OpBase>,
-                  ModeInferShapeFunctor);
-REGISTER_OPERATOR(mode_grad, ops::ModeOpGrad);
diff --git a/paddle/fluid/operators/nll_loss_op.cc b/paddle/fluid/operators/nll_loss_op.cc
deleted file mode 100644
index 782b67d90e81f..0000000000000
--- a/paddle/fluid/operators/nll_loss_op.cc
+++ /dev/null
@@ -1,148 +0,0 @@
-/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <memory>
-#include <string>
-
-#include "paddle/fluid/framework/infershape_utils.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/phi/infermeta/backward.h"
-#include "paddle/phi/infermeta/ternary.h"
-
-namespace paddle {
-namespace operators {
-
-class NLLLossOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        OperatorWithKernel::IndicateVarDataType(ctx, "X"),
-        ctx.device_context());
-  }
-};
-
-class NLLLossOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(Tensor, default Tensor<float>) A tensor whose last dimension "
-             "size is equal to the number of classes. It  is expected to "
-             "contain log-probabilities of each class. "
-             "The X tensor's shape has to be either [batch_size, C] or"
-             "[batch_size, C, dim1, ..., dimK] in with K >= 1 in the case "
-             " K-dimensional loss.");
-    AddInput("Label",
-             "(Tensor, default Tensor<int64_t>) A tensor which represents the "
-             "the ground truth. It contains the class index in the range "
-             "[0, C-1] where C = number of classes. The Lable tensor's "
-             "shape has to be (batch_size), or "
-             "(batch_size, dim1, ..., dimK) "
-             "with K >= 1 in the case K-dimensional loss.");
-    AddInput("Weight",
-             "(Tensor, optional) A tensor should be a 1D tensor assigning "
-             "weight to each of the classes. It's shape must be [C], where "
-             "C is the class number.")
-        .AsDispensable();
-    AddOutput("Out",
-              "(Tensor, default Tensor<float>) A tensor that represents the "
-              "NLL loss.");
-    AddOutput("Total_weight",
-              "(Tensor, default Tensor<float>) A tensor saves the total"
-              "weight value in the forward process.");
-    AddAttr<int64_t>("ignore_index",
-                     "(int64_t, default -100), Specifies a target value that is"
-                     "ignored and does not contribute to the input gradient.")
-        .SetDefault(-100);
-    AddAttr<std::string>(
-        "reduction",
-        "(string, default mean), Specifies the reduction to apply"
-        "to the output. The options include \"none\", \"mean\","
-        "\"sum\".")
-        .SetDefault("mean");
-    AddComment(R"DOC(
-NLL(Negative Log Likelihood) Loss Operator.
-
-This operator computes the NLL loss according to the inputs.
-The loss can be described as:
-
-$Out[i] = -X[Label[i]]*Weight[Label[i]]$
-
-It can also be used for higher dimension inputs, such as 2D images, by
-providing an input of shape (batch_size, C, d1, d2, ..., dK), with
-K >= 1, where K is the number of dimensions, and a Label of
-appropriate shape. In the case of images, it computes NLL loss
-per-pixel.
-
-)DOC");
-  }
-};
-
-class NLLLossGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        OperatorWithKernel::IndicateVarDataType(ctx, "X"),
-        ctx.device_context());
-  }
-};
-
-template <typename T>
-class NLLLossGradMaker : public framework::SingleGradOpMaker<T> {
- public:
-  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
-
- protected:
-  void Apply(GradOpPtr<T> op) const override {
-    op->SetType("nll_loss_grad");
-    op->SetInput("X", this->Input("X"));
-    op->SetInput("Label", this->Input("Label"));
-    op->SetInput("Total_weight", this->Output("Total_weight"));
-
-    if (this->HasInput("Weight")) {
-      op->SetInput("Weight", this->Input("Weight"));
-    }
-    op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
-    op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
-
-    op->SetAttrMap(this->Attrs());
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-DECLARE_INFER_SHAPE_FUNCTOR(nll_loss,
-                            NllLossRawInferShapeFunctor,
-                            PD_INFER_META(phi::NllLossRawInferMeta));
-DECLARE_INFER_SHAPE_FUNCTOR(nll_loss_grad,
-                            NllLossGradInferShapeFunctor,
-                            PD_INFER_META(phi::NllLossGradInferMeta));
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(nll_loss,
-                  ops::NLLLossOp,
-                  ops::NLLLossOpMaker,
-                  ops::NLLLossGradMaker<paddle::framework::OpDesc>,
-                  ops::NLLLossGradMaker<paddle::imperative::OpBase>,
-                  NllLossRawInferShapeFunctor);
-REGISTER_OPERATOR(nll_loss_grad,
-                  ops::NLLLossGradOp,
-                  NllLossGradInferShapeFunctor);
diff --git a/paddle/fluid/operators/qr_op.cc b/paddle/fluid/operators/qr_op.cc
deleted file mode 100644
index 3eac56d1604b9..0000000000000
--- a/paddle/fluid/operators/qr_op.cc
+++ /dev/null
@@ -1,120 +0,0 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "paddle/fluid/framework/infershape_utils.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/phi/core/ddim.h"
-#include "paddle/phi/core/infermeta_utils.h"
-#include "paddle/phi/infermeta/unary.h"
-
-namespace paddle {
-namespace operators {
-using DDim = framework::DDim;
-
-class QrOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-};
-
-class QrOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "(Tensor), The input tensor of qr op.");
-    AddOutput("Q", "(Tensor), The output Q tensor of qr op.");
-    AddOutput("R", "(Tensor), The output R tensor of qr op.");
-    AddAttr<std::string>(
-        "mode",
-        "(string, default \"reduced\"). "
-        "If mode is \"reduced\", Qr op will return reduced Q and R matrices. "
-        "If mode is \"complete\", Qr op will return complete Q and R matrices. "
-        "If mode is \"r\", Qr op will only return reduced R matrix.")
-        .SetDefault("reduced");
-    AddComment(R"DOC(
-Qr Operator.
-This operator is used to perform QR operation for batched matrics $X$.
-$$Q, R = qr(X)$$
-)DOC");
-  }
-};
-
-class QrGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Q")),
-                   "Input",
-                   "Q@Grad",
-                   "QrGrad");
-    OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("R")),
-                   "Input",
-                   "R@Grad",
-                   "QrGrad");
-    OP_INOUT_CHECK(ctx->HasInput("Q"), "Input", "Q", "QrGrad");
-    OP_INOUT_CHECK(ctx->HasInput("R"), "Input", "R", "QrGrad");
-    OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")),
-                   "Output",
-                   "X@Grad",
-                   "QrGrad");
-
-    auto x_dims = ctx->GetInputDim(("X"));
-    ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    auto dtype = OperatorWithKernel::IndicateVarDataType(ctx, "X");
-    return framework::OpKernelType(dtype, ctx.GetPlace());
-  }
-};
-
-template <typename T>
-class QrGradMaker : public framework::SingleGradOpMaker<T> {
- public:
-  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
-
-  void Apply(GradOpPtr<T> retv) const override {
-    retv->SetType("qr_grad");
-    retv->SetInput(framework::GradVarName("Q"), this->OutputGrad("Q"));
-    retv->SetInput(framework::GradVarName("R"), this->OutputGrad("R"));
-    retv->SetInput("Q", this->Output("Q"));
-    retv->SetInput("R", this->Output("R"));
-    retv->SetInput("X", this->Input("X"));
-    retv->SetAttrMap(this->Attrs());
-    retv->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-DECLARE_INFER_SHAPE_FUNCTOR(qr,
-                            QrInferShapeFunctor,
-                            PD_INFER_META(phi::QrInferMeta));
-
-REGISTER_OPERATOR(qr,
-                  ops::QrOp,
-                  ops::QrOpMaker,
-                  ops::QrGradMaker<paddle::framework::OpDesc>,
-                  ops::QrGradMaker<paddle::imperative::OpBase>,
-                  QrInferShapeFunctor);
-
-REGISTER_OPERATOR(qr_grad, ops::QrGradOp);
diff --git a/paddle/fluid/operators/renorm_op.cc b/paddle/fluid/operators/renorm_op.cc
deleted file mode 100644
index 1dc333460b6ed..0000000000000
--- a/paddle/fluid/operators/renorm_op.cc
+++ /dev/null
@@ -1,89 +0,0 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-#include "paddle/fluid/framework/infershape_utils.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/phi/core/infermeta_utils.h"
-#include "paddle/phi/infermeta/unary.h"
-
-namespace paddle {
-namespace operators {
-
-class RenormOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  using DDim = paddle::framework::DDim;
-};
-
-class RenormOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "(Tensor), The input tensor of renorm op.");
-    AddOutput("Out", "(Tensor), The output tensor of renorm op.");
-    AddAttr<float>("p", "(float, norm's power");
-    AddAttr<int>("axis",
-                 "int,the dimension to slice over to get the sub-tensors");
-    AddAttr<float>("max_norm", "(float, the norm upper-bound");
-    AddComment(R"DOC(
-Renorm Operator.
-
-This operator is used to scale tensor sliced by axis if its p-norm execeeds maxnorm
-
-)DOC");
-  }
-};
-
-class RenormGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-};
-
-template <typename T>
-class RenormGradMaker : public framework::SingleGradOpMaker<T> {
- public:
-  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
-
-  void Apply(GradOpPtr<T> retv) const override {
-    retv->SetType("renorm_grad");
-    retv->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
-    retv->SetInput("X", this->Input("X"));
-    retv->SetAttrMap(this->Attrs());
-    retv->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-DECLARE_INFER_SHAPE_FUNCTOR(renorm,
-                            RenormInferShapeFunctor,
-                            PD_INFER_META(phi::UnchangedInferMeta));
-
-DECLARE_INFER_SHAPE_FUNCTOR(renorm_grad,
-                            RenormGradInferShapeFunctor,
-                            PD_INFER_META(phi::UnchangedInferMeta));
-
-REGISTER_OPERATOR(renorm,
-                  ops::RenormOp,
-                  ops::RenormOpMaker,
-                  ops::RenormGradMaker<paddle::framework::OpDesc>,
-                  ops::RenormGradMaker<paddle::imperative::OpBase>,
-                  RenormInferShapeFunctor)
-
-REGISTER_OPERATOR(renorm_grad, ops::RenormGradOp, RenormGradInferShapeFunctor);
diff --git a/paddle/phi/api/yaml/backward.yaml b/paddle/phi/api/yaml/backward.yaml
index 2d333805b5aa0..3706935624dd5 100644
--- a/paddle/phi/api/yaml/backward.yaml
+++ b/paddle/phi/api/yaml/backward.yaml
@@ -687,6 +687,15 @@
     func : logsigmoid_grad
   inplace : (out_grad -> x_grad)
 
+- backward_op : lu_unpack_grad
+  forward : lu_unpack (Tensor x, Tensor y, bool unpack_ludata = true, bool unpack_pivots = true) -> Tensor(pmat), Tensor(l), Tensor(u)
+  args : (Tensor x, Tensor y, Tensor l, Tensor u, Tensor pmat, Tensor l_grad, Tensor u_grad, bool unpack_ludata, bool unpack_pivots)
+  output : Tensor(x_grad)
+  infer_meta :
+    func : LUUnpackGradInferMeta
+  kernel :
+    func : lu_unpack_grad
+
 - backward_op : masked_select_grad
   forward : masked_select (Tensor x, Tensor mask) -> Tensor(out)
   args : (Tensor x, Tensor mask, Tensor out_grad)
@@ -719,6 +728,16 @@
   kernel :
     func : maxout_grad
 
+- backward_op : mode_grad
+  forward : mode(Tensor x,  int axis = -1,  bool keepdim = false) -> Tensor(out), Tensor(indices)
+  args : (Tensor x, Tensor indices, Tensor out_grad,  int axis,  bool keepdim)
+  output : Tensor(x_grad)
+  infer_meta :
+    func : UnchangedInferMeta
+    param: [x]
+  kernel :
+    func : mode_grad
+
 - backward_op : mv_grad
   forward : mv (Tensor x, Tensor vec) -> Tensor(out)
   args : (Tensor x, Tensor vec, Tensor out_grad)
@@ -729,6 +748,17 @@
   kernel :
     func : mv_grad
 
+- backward_op : nll_loss_grad
+  forward : nll_loss (Tensor input, Tensor label, Tensor weight, int64_t ignore_index = -100, str reduction = "mean") -> Tensor(out), Tensor(total_weight)
+  args : (Tensor input, Tensor label, Tensor weight, Tensor total_weight, Tensor out_grad, int64_t ignore_index, str reduction)
+  output : Tensor(input_grad)
+  infer_meta :
+    func : NllLossGradInferMeta
+  kernel :
+    func : nll_loss_grad
+    data_type : input
+  optional : weight
+
 - backward_op : poisson_grad
   forward : poisson (Tensor x) -> Tensor(out)
   args : (Tensor out_grad)
@@ -739,6 +769,16 @@
   kernel :
     func : poisson_grad
 
+- backward_op : qr_grad
+  forward : qr (Tensor x, str mode = "reduced") -> Tensor(q), Tensor(r)
+  args : (Tensor x, Tensor q, Tensor r, Tensor q_grad, Tensor r_grad, str mode)
+  output : Tensor(x_grad)
+  infer_meta :
+    func : UnchangedInferMeta
+    param : [x]
+  kernel :
+    func : qr_grad
+
 - backward_op : reciprocal_grad
   forward : reciprocal (Tensor x) -> Tensor(out)
   args : (Tensor out, Tensor out_grad)
@@ -773,6 +813,16 @@
   backward: relu_double_grad
   inplace : (out_grad -> x_grad)
 
+- backward_op : renorm_grad
+  forward : renorm (Tensor x, float p, int axis, float max_norm) -> Tensor(out)
+  args : (Tensor x, Tensor out_grad, float p, int axis, float max_norm)
+  output : Tensor(x_grad)
+  infer_meta :
+    func : UnchangedInferMeta
+    param : [out_grad]
+  kernel :
+    func : renorm_grad
+
 - backward_op : round_grad
   forward : round(Tensor x) -> Tensor(out)
   args : (Tensor out_grad)
diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml
index b0ce57461685e..76e5720a8e5cf 100755
--- a/paddle/phi/api/yaml/legacy_backward.yaml
+++ b/paddle/phi/api/yaml/legacy_backward.yaml
@@ -804,15 +804,6 @@
   kernel :
     func : lu_grad
 
-- backward_op : lu_unpack_grad
-  forward : lu_unpack (Tensor x, Tensor y, bool unpack_ludata, bool unpack_pivots) -> Tensor(pmat), Tensor(l), Tensor(u)
-  args : (Tensor x, Tensor y, Tensor l, Tensor u, Tensor pmat, Tensor l_grad, Tensor u_grad, bool unpack_ludata, bool unpack_pivots)
-  output : Tensor(x_grad)
-  infer_meta :
-    func : LUUnpackGradInferMeta
-  kernel :
-    func : lu_unpack_grad
-
 - backward_op : margin_cross_entropy_grad
   forward : margin_cross_entropy (Tensor logits, Tensor label, bool return_softmax, int ring_id, int rank, int nranks, float margin1, float margin2, float margin3, float scale) -> Tensor(softmax), Tensor(loss)
   args : (Tensor logits, Tensor label, Tensor softmax, Tensor loss_grad, bool return_softmax, int ring_id, int rank, int nranks, float margin1, float margin2, float margin3, float scale)
@@ -964,16 +955,6 @@
     func : mish_grad
   inplace : (out_grad -> x_grad)
 
-- backward_op : mode_grad
-  forward : mode(Tensor x,  int axis,  bool keepdim) -> Tensor(out), Tensor(indices)
-  args : (Tensor x, Tensor indices, Tensor out_grad,  int axis,  bool keepdim)
-  output : Tensor(x_grad)
-  infer_meta :
-    func : UnchangedInferMeta
-    param: [x]
-  kernel :
-    func : mode_grad
-
 - backward_op : multi_dot_grad
   forward : multi_dot (Tensor[] x) -> Tensor(out)
   args : (Tensor[] x, Tensor out_grad)
@@ -1041,17 +1022,6 @@
     func : nearest_interp_grad
     data_type : output_grad
 
-- backward_op : nll_loss_grad
-  forward : nll_loss (Tensor input, Tensor label, Tensor weight, int64_t ignore_index, str reduction) -> Tensor(out), Tensor(total_weight)
-  args : (Tensor input, Tensor label, Tensor weight, Tensor total_weight, Tensor out_grad, int64_t ignore_index, str reduction)
-  output : Tensor(input_grad)
-  infer_meta :
-    func : NllLossGradInferMeta
-  kernel :
-    func : nll_loss_grad
-    data_type : input
-  optional : weight
-
 - backward_op : norm_grad
   forward : norm (Tensor x, int axis, float epsilon, bool is_test) -> Tensor(out), Tensor(norm)
   args : (Tensor x, Tensor norm, Tensor out_grad, int axis, float epsilon, bool is_test)
@@ -1246,16 +1216,6 @@
   kernel :
     func : put_along_axis_grad
 
-- backward_op : qr_grad
-  forward : qr (Tensor x, str mode) -> Tensor(q), Tensor(r)
-  args : (Tensor x, Tensor q, Tensor r, Tensor q_grad, Tensor r_grad, str mode)
-  output : Tensor(x_grad)
-  infer_meta :
-    func : UnchangedInferMeta
-    param : [x]
-  kernel :
-    func : qr_grad
-
 - backward_op : real_grad
   forward : real (Tensor x) -> Tensor(out)
   args : (Tensor out_grad)
@@ -1273,16 +1233,6 @@
     func : relu6_grad
   inplace : (out_grad -> x_grad)
 
-- backward_op : renorm_grad
-  forward : renorm (Tensor x, float p, int axis, float max_norm) -> Tensor(out)
-  args : (Tensor x, Tensor out_grad, float p, int axis, float max_norm)
-  output : Tensor(x_grad)
-  infer_meta :
-    func : UnchangedInferMeta
-    param : [out_grad]
-  kernel :
-    func : renorm_grad
-
 - backward_op : repeat_interleave_grad
   forward : repeat_interleave(Tensor x, int repeats, int axis) -> Tensor(out)
   args : (Tensor x, Tensor out_grad, int repeats, int axis)
diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml
index d32a853b8c094..1b2ee0f7e0aac 100755
--- a/paddle/phi/api/yaml/legacy_ops.yaml
+++ b/paddle/phi/api/yaml/legacy_ops.yaml
@@ -1164,16 +1164,6 @@
     func : lu
   backward : lu_grad
 
-- op : lu_unpack
-  args : (Tensor x, Tensor y, bool unpack_ludata, bool unpack_pivots)
-  output : Tensor(pmat), Tensor(l), Tensor(u)
-  infer_meta :
-    func : LUUnpackInferMeta
-  kernel :
-    func : lu_unpack
-    data_type : x
-  backward : lu_unpack_grad
-
 - op : margin_cross_entropy
   args : (Tensor logits, Tensor label, bool return_softmax, int ring_id, int rank, int nranks, float margin1, float margin2, float margin3, float scale)
   output : Tensor(softmax), Tensor(loss)
@@ -1339,15 +1329,6 @@
     func : mish
   backward : mish_grad
 
-- op : mode
-  args : (Tensor x,  int axis,  bool keepdim)
-  output : Tensor(out), Tensor(indices)
-  infer_meta :
-    func : ModeInferMeta
-  kernel :
-    func : mode
-  backward : mode_grad
-
 - op : momentum_
   args : (Tensor param, Tensor grad, Tensor velocity, Tensor learning_rate, Tensor master_param, float mu, bool use_nesterov = false, str regularization_method = "", float regularization_coeff = 0.0, bool multi_precision = false, float rescale_grad = 1.0f)
   output : Tensor(param_out), Tensor(velocity_out), Tensor(master_param_out)
@@ -1416,17 +1397,6 @@
     data_type : x
   backward : nearest_interp_grad
 
-- op : nll_loss
-  args : (Tensor input, Tensor label, Tensor weight, int64_t ignore_index, str reduction)
-  output : Tensor(out), Tensor(total_weight)
-  infer_meta :
-    func : NllLossRawInferMeta
-  kernel :
-    func : nll_loss
-    data_type : input
-  optional : weight
-  backward : nll_loss_grad
-
 - op : nms
   args : (Tensor x, float threshold)
   output : Tensor(out)
@@ -1615,15 +1585,6 @@
   inplace : (arr -> out)
   backward : put_along_axis_grad
 
-- op : qr
-  args : (Tensor x, str mode)
-  output : Tensor(q), Tensor(r)
-  infer_meta :
-    func : QrInferMeta
-  kernel :
-    func : qr
-  backward : qr_grad
-
 - op : randint
   args : (int low, int high, IntArray shape, DataType dtype=DataType::INT64, Place place={})
   output : Tensor(out)
@@ -1676,16 +1637,6 @@
     func : remainder
   inplace : (x -> out)
 
-- op : renorm
-  args : (Tensor x, float p, int axis, float max_norm)
-  output : Tensor
-  infer_meta :
-    func : UnchangedInferMeta
-    param : [x]
-  kernel :
-    func : renorm
-  backward : renorm_grad
-
 - op : repeat_interleave
   args : (Tensor x, int repeats, int axis)
   output : Tensor(out)
diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml
index 5640ca7eb8b0f..843ff811f9114 100644
--- a/paddle/phi/api/yaml/op_compat.yaml
+++ b/paddle/phi/api/yaml/op_compat.yaml
@@ -761,6 +761,13 @@
   extra :
     attrs : [bool use_mkldnn = false, bool is_test = false]
 
+- op : lu_unpack
+  backward : lu_unpack_grad
+  inputs :
+    {x : X, y : Pivots}
+  outputs :
+    {pmat : Pmat, l : L, u : U}
+
 - op : masked_select
   inputs :
     {x : X, mask : Mask}
@@ -809,6 +816,13 @@
   extra :
     attrs : [bool use_mkldnn = false]
 
+- op : mode
+  backward : mode_grad
+  inputs :
+    x : X
+  outputs :
+    {out : Out, indices : Indices}
+
 - op : multiply (elementwise_mul)
   backward : multiply_grad (elementwise_mul_grad)
   extra :
@@ -832,6 +846,13 @@
   extra :
     attrs : [bool use_mkldnn = false]
 
+- op : nll_loss
+  backward : nll_loss_grad
+  inputs :
+    {input : X, label : Label, weight : Weight}
+  outputs :
+    {out : Out, total_weight : Total_weight}
+
 - op : pad2d
   backward : pad2d_grad
   extra :
@@ -869,6 +890,13 @@
   extra :
     attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", bool is_test = false]
 
+- op : qr
+  backward : qr_grad
+  inputs :
+    x : X
+  outputs :
+    {q : Q, r : R}
+
 - op : quantize_linear
   extra :
     attrs : [float moving_rate = 0.9]
@@ -946,6 +974,10 @@
 
 - op : renorm
   backward : renorm_grad
+  inputs :
+    x : X
+  outputs :
+    out : Out
   extra :
     attrs : [bool use_mkldnn = false, bool use_cudnn = false]
 
diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml
index 10b6645c61667..2445a2650d842 100644
--- a/paddle/phi/api/yaml/ops.yaml
+++ b/paddle/phi/api/yaml/ops.yaml
@@ -637,6 +637,16 @@
     func : logsigmoid
   backward : logsigmoid_grad
 
+- op : lu_unpack
+  args : (Tensor x, Tensor y, bool unpack_ludata = true, bool unpack_pivots = true)
+  output : Tensor(pmat), Tensor(l), Tensor(u)
+  infer_meta :
+    func : LUUnpackInferMeta
+  kernel :
+    func : lu_unpack
+    data_type : x
+  backward : lu_unpack_grad
+
 - op : masked_select
   args : (Tensor x, Tensor mask)
   output : Tensor (out)
@@ -665,6 +675,15 @@
     func : maxout
   backward : maxout_grad
 
+- op : mode
+  args : (Tensor x,  int axis = -1,  bool keepdim = false)
+  output : Tensor(out), Tensor(indices)
+  infer_meta :
+    func : ModeInferMeta
+  kernel :
+    func : mode
+  backward : mode_grad
+
 - op : mv
   args : (Tensor x, Tensor vec)
   output : Tensor
@@ -674,6 +693,17 @@
     func : mv
   backward : mv_grad
 
+- op : nll_loss
+  args : (Tensor input, Tensor label, Tensor weight, int64_t ignore_index = -100, str reduction = "mean")
+  output : Tensor(out), Tensor(total_weight)
+  infer_meta :
+    func : NllLossRawInferMeta
+  kernel :
+    func : nll_loss
+    data_type : input
+  optional : weight
+  backward : nll_loss_grad
+
 - op : npu_identity
   args : (Tensor x, int format = -1)
   output : Tensor
@@ -692,6 +722,15 @@
     func : poisson
   backward : poisson_grad
 
+- op : qr
+  args : (Tensor x, str mode = "reduced")
+  output : Tensor(q), Tensor(r)
+  infer_meta :
+    func : QrInferMeta
+  kernel :
+    func : qr
+  backward : qr_grad
+
 - op : reciprocal
   args : (Tensor x)
   output : Tensor(out)
@@ -712,6 +751,16 @@
   inplace : (x -> out)
   backward : relu_grad
 
+- op : renorm
+  args : (Tensor x, float p, int axis, float max_norm)
+  output : Tensor
+  infer_meta :
+    func : UnchangedInferMeta
+    param : [x]
+  kernel :
+    func : renorm
+  backward : renorm_grad
+
 - op : round
   args : (Tensor x)
   output : Tensor(out)
diff --git a/paddle/phi/ops/compat/lu_unpack_sig.cc b/paddle/phi/ops/compat/lu_unpack_sig.cc
deleted file mode 100644
index 8baafe4fcb23a..0000000000000
--- a/paddle/phi/ops/compat/lu_unpack_sig.cc
+++ /dev/null
@@ -1,37 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/phi/core/compat/op_utils.h"
-
-namespace phi {
-
-KernelSignature LUUnpackOpArgumentMapping(const ArgumentMappingContext& ctx) {
-  return KernelSignature("lu_unpack",
-                         {"X", "Pivots"},
-                         {"unpack_ludata", "unpack_pivots"},
-                         {"Pmat", "L", "U"});
-}
-
-KernelSignature LUUnpackGradOpArgumentMapping(
-    const ArgumentMappingContext& ctx) {
-  return KernelSignature("lu_unpack_grad",
-                         {"X", "Pivots", "L", "U", "Pmat", "L@GRAD", "U@GRAD"},
-                         {"unpack_ludata", "unpack_pivots"},
-                         {"X@GRAD"});
-}
-
-}  // namespace phi
-
-PD_REGISTER_ARG_MAPPING_FN(lu_unpack, phi::LUUnpackOpArgumentMapping);
-PD_REGISTER_ARG_MAPPING_FN(lu_unpack_grad, phi::LUUnpackGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/mode_sig.cc b/paddle/phi/ops/compat/mode_sig.cc
deleted file mode 100644
index e21cd69bf60a1..0000000000000
--- a/paddle/phi/ops/compat/mode_sig.cc
+++ /dev/null
@@ -1,34 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/phi/core/compat/op_utils.h"
-
-namespace phi {
-
-KernelSignature ModeOpArgumentMapping(const ArgumentMappingContext& ctx) {
-  return KernelSignature(
-      "mode", {"X"}, {"axis", "keepdim"}, {"Out", "Indices"});
-}
-
-KernelSignature ModeGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
-  return KernelSignature("mode_grad",
-                         {"X", "Indices", "Out@GRAD"},
-                         {"axis", "keepdim"},
-                         {"X@GRAD"});
-}
-
-}  // namespace phi
-
-PD_REGISTER_ARG_MAPPING_FN(mode, phi::ModeOpArgumentMapping);
-PD_REGISTER_ARG_MAPPING_FN(mode_grad, phi::ModeGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/nll_loss_sig.cc b/paddle/phi/ops/compat/nll_loss_sig.cc
deleted file mode 100644
index f3f9c53178192..0000000000000
--- a/paddle/phi/ops/compat/nll_loss_sig.cc
+++ /dev/null
@@ -1,38 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/phi/core/compat/op_utils.h"
-
-namespace phi {
-
-KernelSignature NllLossOpArgumentMapping(const ArgumentMappingContext& ctx) {
-  // TODO(xiongkun): can't remove the forward mapping, because the Weight is
-  // optional
-  return KernelSignature("nll_loss",
-                         {"X", "Label", "Weight"},
-                         {"ignore_index", "reduction"},
-                         {"Out", "Total_weight"});
-}
-
-KernelSignature NllLossGradOpArgumentMapping(
-    const ArgumentMappingContext& ctx) {
-  return KernelSignature("nll_loss_grad",
-                         {"X", "Label", "Weight", "Total_weight", "Out@GRAD"},
-                         {"ignore_index", "reduction"},
-                         {"X@GRAD"});
-}
-
-}  // namespace phi
-PD_REGISTER_ARG_MAPPING_FN(nll_loss_grad, phi::NllLossGradOpArgumentMapping);
-PD_REGISTER_ARG_MAPPING_FN(nll_loss, phi::NllLossOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/qr_sig.cc b/paddle/phi/ops/compat/qr_sig.cc
deleted file mode 100644
index dbe1cd86434f5..0000000000000
--- a/paddle/phi/ops/compat/qr_sig.cc
+++ /dev/null
@@ -1,31 +0,0 @@
-/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/phi/core/compat/op_utils.h"
-
-namespace phi {
-
-KernelSignature QrOpArgumentMapping(const ArgumentMappingContext& ctx) {
-  return KernelSignature("qr", {"X"}, {"mode"}, {"Q", "R"});
-}
-
-KernelSignature QrGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
-  return KernelSignature(
-      "qr_grad", {"X", "Q", "R", "Q@GRAD", "R@GRAD"}, {"mode"}, {"X@GRAD"});
-}
-
-}  // namespace phi
-
-PD_REGISTER_ARG_MAPPING_FN(qr, phi::QrOpArgumentMapping);
-PD_REGISTER_ARG_MAPPING_FN(qr_grad, phi::QrGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/renorm_sig.cc b/paddle/phi/ops/compat/renorm_sig.cc
deleted file mode 100644
index 0c5198dff37b0..0000000000000
--- a/paddle/phi/ops/compat/renorm_sig.cc
+++ /dev/null
@@ -1,34 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/phi/core/compat/op_utils.h"
-
-namespace phi {
-
-KernelSignature RenormOpArgumentMapping(const ArgumentMappingContext& ctx) {
-  VLOG(3) << "in renrom arguments mapping";
-  return KernelSignature("renorm", {"X"}, {"p", "axis", "max_norm"}, {"Out"});
-}
-
-KernelSignature RenormGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
-  VLOG(3) << "in renrom grad arguments mapping";
-  return KernelSignature(
-      "renorm_grad", {"X", "Out@GRAD"}, {"p", "axis", "max_norm"}, {"X@GRAD"});
-}
-
-}  // namespace phi
-
-PD_REGISTER_ARG_MAPPING_FN(renorm, phi::RenormOpArgumentMapping);
-
-PD_REGISTER_ARG_MAPPING_FN(renorm_grad, phi::RenormGradOpArgumentMapping);