diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc
index f37f8f0d6a1e8..ccb5e1e5320d5 100644
--- a/paddle/fluid/framework/naive_executor.cc
+++ b/paddle/fluid/framework/naive_executor.cc
@@ -32,9 +32,6 @@
 #ifdef PADDLE_WITH_NVTX
 #include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h"
 #endif
-#ifdef PADDLE_WITH_LITE
-#include "paddle/fluid/operators/lite/lite_engine_op.h"
-#endif
 
 namespace paddle {
 namespace framework {
@@ -334,38 +331,7 @@ void NaiveExecutor::ResetTrtOps(int num) {
 #endif
 }
 
-void NaiveExecutor::CloneLiteEngine(int num, void *stream) {
-#ifdef PADDLE_WITH_LITE
-  for (auto &op : ops_) {
-    if (op->Type() == "lite_engine") {
-      operators::LiteEngineOp *lite_op =
-          dynamic_cast<operators::LiteEngineOp *>(op.get());
-      PADDLE_ENFORCE_NOT_NULL(
-          lite_op,
-          phi::errors::InvalidArgument(
-              "lite_op(type: lite_engine) should be created."));
-      std::string engine_key = lite_op->Attr<std::string>("engine_key");
-      std::string new_engine_key = engine_key + "_" + std::to_string(num);
-      PADDLE_ENFORCE(
-          paddle::inference::Singleton<inference::lite::EngineManager>::Global()
-              .Has(engine_key),
-          phi::errors::InvalidArgument(
-              "lite_engine(key: %s) should be created.", engine_key));
-      auto *lite_engine =
-          paddle::inference::Singleton<inference::lite::EngineManager>::Global()
-              .Get(engine_key);
-      auto new_lite_engine = lite_engine->Clone();
-#ifdef LITE_SUBGRAPH_WITH_XPU
-      new_lite_engine->SetStream(TARGET(kXPU), stream);
-#endif
-      paddle::inference::Singleton<inference::lite::EngineManager>::Global()
-          .Set(new_engine_key, new_lite_engine);
-      lite_op->SetAttr("engine_key", new_engine_key);
-      lite_op->SetEngine(new_lite_engine.get());
-    }
-  }
-#endif
-}
+void NaiveExecutor::CloneLiteEngine(int num, void *stream) {}
 
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 280f24bdd6fa6..2b342add94906 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -19,7 +19,6 @@ add_subdirectory(controlflow)
 add_subdirectory(detection)
 add_subdirectory(elementwise)
 add_subdirectory(fused)
-add_subdirectory(metrics)
 add_subdirectory(optimizers)
 add_subdirectory(reduce_ops)
 add_subdirectory(sequence_ops)
@@ -49,10 +48,6 @@ if (WITH_DLNNE)
   add_subdirectory(dlnne)
 endif()
 
-if (WITH_LITE)
-  add_subdirectory(lite)
-endif()
-
 if(WITH_CINN)
   add_subdirectory(cinn)
 endif()
diff --git a/paddle/fluid/operators/ctc_align_op.cc b/paddle/fluid/operators/ctc_align_op.cc
deleted file mode 100644
index a40ba84610293..0000000000000
--- a/paddle/fluid/operators/ctc_align_op.cc
+++ /dev/null
@@ -1,133 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
*/ - -#include "paddle/fluid/operators/ctc_align_op.h" - -namespace paddle { -namespace operators { - -class CTCAlignOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "ctc_align"); - OP_INOUT_CHECK(ctx->HasOutput("Output"), "Output", "Output", "ctc_align"); - - auto input_dims = ctx->GetInputDim("Input"); - - // TODO(wanghaoshuang): it is tricky to set the wrong dimension here. - ctx->SetOutputDim("Output", input_dims); - if (ctx->HasInput("InputLength")) { - ctx->SetOutputDim("OutputLength", {input_dims[0], 1}); - } - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "Input"), - ctx.device_context().GetPlace()); - } -}; - -class CTCAlignOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("Input", - "2-D Tensor or LodTensor with shape " - "[Lp, 1], where Lp is the sum of all input sequences' length."); - AddInput("InputLength", - "2-D Tensor with shape [batch_size, 1], " - " When Input is padding mode, InputLength is length of every " - "sequence in Input.") - .AsDispensable(); - AddOutput("Output", "(Tensor, default: Tensor), The align result."); - AddOutput("OutputLength", - "2-D Tensor with shape [batch_size, 1], " - "When Input is padding mode, OutputLength is length of every " - "sequence in Output.") - .AsDispensable(); - AddAttr("blank", - "(int, default: 0), the blank label set in Connectionist " - "Temporal Classification (CTC) op.") - .SetDefault(0); - AddAttr("merge_repeated", - "(bool, default: true), whether to " - "merge repeated elements between two blanks. ") - .SetDefault(true); - // add attr padding number for tensor input - AddAttr("padding_value", - "(int, default: 0), padding number " - "use to padding tensor. ") - .SetDefault(0); - AddComment(R"DOC( -CTCAlign op is used to merge repeated elements between two blanks -and then delete all blanks in sequence. - -Given: - Input.data = [0, 1, 2, 2, 0, 4, 0, 4, 5, 0, 6, - 6, 0, 0, 7, 7, 7, 0] - Input.dims = {18, 1} - Input.LoD = [[0, 11, 18]] - -And: - blank = 0 - merge_repeated = True - -Then: - Output.data = [1, 2, 4, 4, 5, 6, - 6, 7] - Output.dims = {8, 1} - Output.LoD = [[0, 6, 8]] -or Given: - Input.data = [[0, 1, 2, 2, 0, 4], - [0, 4, 5, 0, 6, 0], - [0, 7, 7, 7, 0, 0]] - InputLength.data = [[6], - [5], - [4]], - Input.dims = {3, 6}, - Input.Lod = [] -And: - blank = 0 - merge_repeated = True - padding_value = 0 - -Then: - Output.data = [[1, 2, 4, 0, 0, 0], - [4, 5, 6, 0, 0, 0], - [7, 0, 0, 0, 0, 0]], - OutputLength.data = [[3], - [3], - [1]], - Output.dims = {3, 6}, - Output.Lod = [] -)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR( - ctc_align, - ops::CTCAlignOp, - ops::CTCAlignOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); - -PD_REGISTER_STRUCT_KERNEL( - ctc_align, CPU, ALL_LAYOUT, ops::CTCAlignKernel, int, int64_t) {} diff --git a/paddle/fluid/operators/ctc_align_op.cu b/paddle/fluid/operators/ctc_align_op.cu deleted file mode 100644 index 76466ed12ab88..0000000000000 --- a/paddle/fluid/operators/ctc_align_op.cu +++ /dev/null @@ -1,171 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include - -#include - -#include "paddle/fluid/operators/ctc_align_op.h" - -namespace paddle { -namespace operators { - -template -__global__ void MergeAndDelCudaKernel(const int64_t num_token, - const T* tokens, - const size_t num_seq, - size_t* lod0, - const int blank, - const int merge_repeated, - size_t* out_lod0, - T* output) { - int output_idx = 0; - out_lod0[0] = 0; - - for (int i = 0; i < num_seq; ++i) { - T pre_token = -1; - for (int j = lod0[i]; j < lod0[i + 1]; ++j) { - if (tokens[j] != blank && !(merge_repeated && tokens[j] == pre_token)) { - output[output_idx] = tokens[j]; - ++output_idx; - } - pre_token = tokens[j]; - } - out_lod0[i + 1] = output_idx; - } -} - -template -__global__ void PaddingMergeAndDelCudaKernel(const int64_t num_token, - const T* tokens, - const T* tokens_length, - const int blank, - const int merge_repeated, - const int padding_value, - const int64_t batch_size, - T* output, - T* output_length) { - int ind = blockIdx.x * blockDim.x + threadIdx.x; - if (ind >= batch_size) return; - int output_idx = ind * num_token; - T prev_token = -1; - for (int i = ind * num_token; i < ind * num_token + tokens_length[ind]; i++) { - if ((unsigned)tokens[i] != blank && - !(merge_repeated && tokens[i] == prev_token)) { - output[output_idx] = tokens[i]; - ++output_idx; - } - prev_token = tokens[i]; - } - output_length[ind] = output_idx - ind * num_token; - for (int i = output_idx; i < ind * num_token + num_token; i++) { - output[i] = padding_value; - } -} - -template -class CTCAlignOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), - true, - phi::errors::InvalidArgument( - "CTCAlign operator CUDA kernel must use CUDAPlace " - "rather than CPUPlace.")); - auto* input = ctx.Input("Input"); - auto* output = ctx.Output("Output"); - const int blank = ctx.Attr("blank"); - const int merge_repeated = - static_cast(ctx.Attr("merge_repeated")); - const T* tokens = input->data(); - auto stream = ctx.cuda_device_context().stream(); - - // tensor input which has no lod - if (input->lod().empty()) { - const int padding_value = ctx.Attr("padding_value"); - auto input_dims = input->dims(); - T* output_data = output->mutable_data({input_dims[0], input_dims[1]}, - ctx.GetPlace()); - auto* input_length = ctx.Input("InputLength"); - const T* input_length_data = input_length->data(); - auto* output_length = ctx.Output("OutputLength"); - T* output_length_data = - output_length->mutable_data({input_dims[0], 1}, ctx.GetPlace()); - PaddingMergeAndDelCudaKernel - <<<32, (input_dims[0] + 32 - 1) / 32, 0, stream>>>( - input_dims[1], - tokens, - input_length_data, - blank, - merge_repeated, - padding_value, - input_dims[0], - output_data, - output_length_data); - } else { - const size_t level = 0; - auto input_lod = framework::ToAbsOffset(input->lod()); - - const int64_t num_tokens = input->dims()[0]; - const size_t 
num_seq = input_lod[level].size() - 1; - - // prepare a lod to record lod information while merging elements - thrust::device_vector dev_out_lod0(input_lod[level].size()); - size_t* dev_out_lod0_ptr = thrust::raw_pointer_cast(dev_out_lod0.data()); - - // merge elements and delete blank - T* output_data = output->mutable_data({num_tokens, 1}, ctx.GetPlace()); - - phi::MixVector mixv_input_lod(&input_lod[level]); - MergeAndDelCudaKernel - <<<1, 1, 0, stream>>>(num_tokens, - tokens, - num_seq, - mixv_input_lod.CUDAMutableData(ctx.GetPlace()), - blank, - merge_repeated, - dev_out_lod0_ptr, - output_data); - mixv_input_lod.CopyToCPU(); - - // set output lod - std::vector host_out_lod0(dev_out_lod0.begin(), - dev_out_lod0.end()); - framework::LoD out_lod; - out_lod.push_back(host_out_lod0); - output->set_lod(out_lod); - - // resize output dims - output->Resize({static_cast(host_out_lod0.back()), 1}); - - if (host_out_lod0.back() == 0) { - output->Resize({1, 1}); - output->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant set_constant; - set_constant( - ctx.template device_context(), output, -1); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -PD_REGISTER_STRUCT_KERNEL( - ctc_align, GPU, ALL_LAYOUT, ops::CTCAlignOpCUDAKernel, int, int64_t) {} diff --git a/paddle/fluid/operators/ctc_align_op.h b/paddle/fluid/operators/ctc_align_op.h deleted file mode 100644 index 9ebfa7196ecc5..0000000000000 --- a/paddle/fluid/operators/ctc_align_op.h +++ /dev/null @@ -1,119 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include - -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -class CTCAlignKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("Input"); - auto* output = ctx.Output("Output"); - size_t blank = static_cast(ctx.Attr("blank")); - bool merge_repeated = ctx.Attr("merge_repeated"); - T* output_data = output->mutable_data(ctx.GetPlace()); - auto input_dims = common::vectorize(input->dims()); - const T* input_data = input->data(); - - // support tensor input, no lod information - if (input->lod().empty()) { - size_t padding_value = - static_cast(ctx.Attr("padding_value")); - auto* input_length = ctx.Input("InputLength"); - const T* input_length_data = input_length->data(); - - auto* output_length = ctx.Output("OutputLength"); - T* output_length_data = output_length->mutable_data(ctx.GetPlace()); - - for (size_t batch_id = 0; batch_id < (unsigned)input_dims[0]; - batch_id++) { - T prev_token = -1; - size_t output_idx = 0; - for (size_t i = 0; i < (unsigned)input_length_data[batch_id]; i++) { - size_t input_ind = batch_id * input_dims[1] + i; - if ((unsigned)input_data[input_ind] != blank && - !(merge_repeated && input_data[input_ind] == prev_token)) { - output_data[batch_id * input_dims[1] + output_idx] = - input_data[input_ind]; - ++output_idx; - } - prev_token = input_data[input_ind]; - } - output_length_data[batch_id] = output_idx; - for (size_t j = output_idx; j < (unsigned)input_dims[1]; j++) - output_data[batch_id * input_dims[1] + j] = padding_value; - } - } else { - const size_t level = 0; - auto input_lod = framework::ToAbsOffset(input->lod()); - - // check input dims and lod - PADDLE_ENFORCE_EQ( - input_dims[0], - static_cast(input_lod[level].back()), - phi::errors::InvalidArgument( - "The first dimension %d of CTCAlign operator Input(Input) should " - "be equal to " - "the sum of all sequences' lengths %d.", - input_dims[0], - static_cast(input_lod[level].back()))); - - const size_t num_sequences = input_lod[level].size() - 1; - - // merge repeated tokens and delete blank - size_t output_idx = 0; - std::vector output_lod0(1, 0); - for (size_t seq_idx = 0; seq_idx < num_sequences; ++seq_idx) { - T prev_token = -1; - for (size_t i = input_lod[level][seq_idx]; - i < input_lod[level][seq_idx + 1]; - ++i) { - if ((unsigned)input_data[i] != blank && - !(merge_repeated && input_data[i] == prev_token)) { - output_data[output_idx] = input_data[i]; - ++output_idx; - } - prev_token = input_data[i]; - } - output_lod0.push_back(output_idx); - } - - // set output lod - framework::LoD output_lod; - output_lod.push_back(output_lod0); - output->set_lod(output_lod); - // resize output dims - output->Resize({static_cast(output_lod0.back()), 1}); - // for empty sequence - if (output_lod0.back() == 0) { - output->Resize({1, 1}); - output_data = output->mutable_data(ctx.GetPlace()); - output_data[0] = -1; - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/dequeue_op.cc b/paddle/fluid/operators/dequeue_op.cc deleted file mode 100644 index 8fcc0fbfb47da..0000000000000 --- a/paddle/fluid/operators/dequeue_op.cc +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include - -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" -using LoDTensorBlockingQueueHolder = - paddle::operators::reader::LoDTensorBlockingQueueHolder; - -namespace paddle { -namespace operators { - -class DequeueOp : public framework::OperatorBase { - public: - using framework::OperatorBase::OperatorBase; - DequeueOp(const std::string& type, - const framework::VariableNameMap& inputs, - const framework::VariableNameMap& outputs, - const framework::AttributeMap& attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - private: - void RunImpl(const framework::Scope& scope, - const platform::Place& dev_place) const override { - const std::string& queue_name = Attr("queue_name"); - auto* queue_holder_var = scope.FindVar(queue_name); - PADDLE_ENFORCE_NOT_NULL( - queue_holder_var, - phi::errors::NotFound( - "No LoDTensorBlockingQueueHolder variable with name %s found.", - queue_name)); - auto* queue_holder = - queue_holder_var->template GetMutable(); - auto& out_names = Outputs("Out"); - PADDLE_ENFORCE_GT(out_names.size(), - 0, - phi::errors::InvalidArgument( - "The output for Op(dequeue) must be set.")); - for (const auto& out_name : out_names) { - auto out_var = scope.FindVar(out_name); - PADDLE_ENFORCE_NOT_NULL( - out_var, - phi::errors::NotFound("No variable with name %s found", out_name)); - auto* out_tensor = out_var->GetMutable(); - PADDLE_ENFORCE_NOT_NULL( - out_tensor, - phi::errors::InvalidArgument( - "Variable with name %s has not been initialized.", out_name)); - - paddle::framework::LoDTensorArray lod_tensor_vec; - bool success = false; - lod_tensor_vec = queue_holder->GetQueue()->Pop(&success); - PADDLE_ENFORCE_EQ(lod_tensor_vec.size(), - 1, - phi::errors::InvalidArgument( - "Expected to pop only one element per Pop call for " - "Op(dequeue), but poped %d element.", - lod_tensor_vec.size())); - for (auto& lod_tensor : lod_tensor_vec) { - paddle::framework::TensorCopySync(lod_tensor, dev_place, out_tensor); - } - } - } -}; - -class DequeueOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddAttr("queue_name", - "Name of the `LoDTensorBlockingQueueHolder` variable"); - AddOutput("Out", "A list of `lod_tensor` to dequeue and assigned.") - .AsDuplicable(); - AddComment(R"DOC( - Dequeue operator. 
- )DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = ::paddle::operators; - -REGISTER_OP_WITHOUT_GRADIENT(dequeue, ops::DequeueOp, ops::DequeueOpMaker); diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index 84b5ab20144e3..9c8914d14a647 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -36,15 +36,10 @@ detection_library(anchor_generator_op SRCS anchor_generator_op.cc anchor_generator_op.cu) detection_library(polygon_box_transform_op SRCS polygon_box_transform_op.cc polygon_box_transform_op.cu) -detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc) -detection_library(generate_proposal_labels_op SRCS - generate_proposal_labels_op.cc) detection_library(multiclass_nms_op SRCS multiclass_nms_op.cc DEPS phi common) detection_library(box_clip_op SRCS box_clip_op.cc box_clip_op.cu) detection_library(box_decoder_and_assign_op SRCS box_decoder_and_assign_op.cc box_decoder_and_assign_op.cu) -detection_library(retinanet_detection_output_op SRCS - retinanet_detection_output_op.cc) if(WITH_GPU OR WITH_ROCM) if(WITH_GPU) @@ -67,8 +62,3 @@ endif() #Export local libraries to parent # set(DETECTION_LIBRARY ${LOCAL_DETECTION_LIBS} PARENT_SCOPE) - -cc_library(mask_util SRCS mask_util.cc) - -detection_library(generate_mask_labels_op SRCS generate_mask_labels_op.cc DEPS - mask_util) diff --git a/paddle/fluid/operators/detection/generate_mask_labels_op.cc b/paddle/fluid/operators/detection/generate_mask_labels_op.cc deleted file mode 100644 index 5ee843d72387b..0000000000000 --- a/paddle/fluid/operators/detection/generate_mask_labels_op.cc +++ /dev/null @@ -1,547 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include - -#include -#include -#include - -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/detection/bbox_util.h" -#include "paddle/fluid/operators/detection/mask_util.h" -#include "paddle/fluid/operators/math/concat_and_split.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -const int kBoxDim = 4; - -template -void AppendMask(phi::DenseTensor* out, - int64_t offset, - phi::DenseTensor* to_add) { - auto* out_data = out->data(); - auto* to_add_data = to_add->data(); - memcpy(out_data + offset, to_add_data, to_add->numel() * sizeof(T)); -} - -class GenerateMaskLabelsOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasInput("ImInfo"), - true, - phi::errors::InvalidArgument("Input(ImInfo) shouldn't be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("GtClasses"), - true, - phi::errors::InvalidArgument("Input(GtClasses) shouldn't be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("IsCrowd"), - true, - phi::errors::InvalidArgument("Input(IsCrowd) shouldn't be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("GtSegms"), - true, - phi::errors::InvalidArgument("Input(GtSegms) shouldn't be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("Rois"), - true, - phi::errors::InvalidArgument("Input(Rois) shouldn't be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("LabelsInt32"), - true, - phi::errors::InvalidArgument("Input(LabelsInt32) shouldn't be null.")); - - PADDLE_ENFORCE_EQ( - ctx->HasOutput("MaskRois"), - true, - phi::errors::InvalidArgument( - "Output(MaskRois) of GenerateMaskLabelsOp should not be null")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("RoiHasMaskInt32"), - true, - phi::errors::InvalidArgument( - "Output(RoiHasMaskInt32) of GenerateMaskLabelsOp " - "should not be null")); - PADDLE_ENFORCE_EQ( - ctx->HasOutput("MaskInt32"), - true, - phi::errors::InvalidArgument( - "Output(MaskInt32) of GenerateMaskLabelsOp should not be null")); - - auto im_info_dims = ctx->GetInputDim("ImInfo"); - auto gt_segms_dims = ctx->GetInputDim("GtSegms"); - PADDLE_ENFORCE_EQ( - im_info_dims.size(), - 2, - phi::errors::InvalidArgument("The rank of Input(ImInfo) must be 2.")); - PADDLE_ENFORCE_EQ( - gt_segms_dims.size(), - 2, - phi::errors::InvalidArgument("The rank of Input(GtSegms) must be 2.")); - PADDLE_ENFORCE_EQ(gt_segms_dims[1], - 2, - phi::errors::InvalidArgument( - "The second dim of Input(GtSegms) must be 2.")); - int num_classes = ctx->Attrs().Get("num_classes"); - int resolution = ctx->Attrs().Get("resolution"); - - ctx->SetOutputDim("MaskRois", {-1, 4}); - ctx->SetOutputDim("RoiHasMaskInt32", {-1, 1}); - ctx->SetOutputDim("MaskInt32", {-1, num_classes * resolution * resolution}); - if (!ctx->IsRuntime()) { - ctx->SetLoDLevel("MaskRois", ctx->GetLoDLevel("Rois")); - ctx->SetLoDLevel("RoiHasMaskInt32", ctx->GetLoDLevel("Rois")); - ctx->SetLoDLevel("MaskInt32", ctx->GetLoDLevel("Rois")); - } - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "Rois"); - return phi::KernelKey(data_type, platform::CPUPlace()); - } -}; - -/* - * Expand masks from shape (#masks, M ** 2) to (#masks, #classes * M ** 2) - * to encode class specific mask targets. 
- */ -template -static inline void ExpandMaskTarget(const phi::CPUContext& ctx, - const phi::DenseTensor& masks, - const phi::DenseTensor& mask_class_labels, - const int resolution, - const int num_classes, - phi::DenseTensor* mask_targets) { - const uint8_t* masks_data = masks.data(); - int64_t num_mask = masks.dims()[0]; - const int* mask_class_labels_data = mask_class_labels.data(); - const int M = resolution * resolution; - const int mask_dim = M * num_classes; - - int* mask_targets_data = - mask_targets->mutable_data({num_mask, mask_dim}, ctx.GetPlace()); - phi::funcs::set_constant(ctx, mask_targets, static_cast(-1)); - for (int64_t mask_id = 0; mask_id < num_mask; ++mask_id) { - int cls = mask_class_labels_data[mask_id]; - int start = M * cls; - if (cls > 0) { - for (int i = 0; i < M; ++i) { - mask_targets_data[mask_id * mask_dim + start + i] = - static_cast(masks_data[mask_id * M + i]); - } - } - } -} - -template -std::vector SampleMaskForOneImage( - const phi::CPUContext& ctx, - const phi::DenseTensor& im_info, - const phi::DenseTensor& gt_classes, - const phi::DenseTensor& is_crowd, - const phi::DenseTensor& gt_segms, - const phi::DenseTensor& rois, - const phi::DenseTensor& label_int32, - const int num_classes, - const int resolution, - const framework::LoD& segm_length) { - // Prepare the mask targets by associating one gt mask to each training roi - // that has a fg (non-bg) class label. - const int64_t gt_size = static_cast(gt_classes.dims()[0]); - const int64_t roi_size = static_cast(rois.dims()[0]); - const int* gt_classes_data = gt_classes.data(); - const int* is_crowd_data = is_crowd.data(); - const int* label_int32_data = label_int32.data(); - PADDLE_ENFORCE_EQ(roi_size, - label_int32.dims()[0], - phi::errors::InvalidArgument( - "The first dim of label [%d] is the different from " - "roi_size [%d], they should be same.", - label_int32.dims()[0], - roi_size)); - - std::vector mask_gt_inds, fg_inds; - std::vector>> gt_polys; - - auto polys_num = segm_length[1]; - auto segm_lod_offset = framework::ConvertToOffsetBasedLoD(segm_length); - auto lod1 = segm_lod_offset[1]; - auto lod2 = segm_lod_offset[2]; - const T* polys_data = gt_segms.data(); - for (int64_t i = 0; i < gt_size; ++i) { - if ((gt_classes_data[i] > 0) && (is_crowd_data[i] == 0)) { - mask_gt_inds.emplace_back(i); - - // slice fg segmentation polys - int poly_num = static_cast(polys_num[i]); - std::vector> polys; - int s_idx = static_cast(lod1[i]); - for (int j = 0; j < poly_num; ++j) { - int s = static_cast(lod2[s_idx + j]); - int e = static_cast(lod2[s_idx + j + 1]); - PADDLE_ENFORCE_NE(s, - e, - phi::errors::InvalidArgument( - "The start point and the end point in the poly " - "segment [%d] should not be same, but received " - "the start point [%d] and the end point [%d].", - i, - s, - e)); - std::vector plts(polys_data + s * 2, polys_data + e * 2); - polys.push_back(plts); - } - gt_polys.push_back(polys); - } - } - for (int64_t i = 0; i < roi_size; ++i) { - if (label_int32_data[i] > 0) { - fg_inds.emplace_back(i); - } - } - int gt_num = static_cast(mask_gt_inds.size()); - int fg_num = static_cast(fg_inds.size()); - - phi::DenseTensor boxes_from_polys; - boxes_from_polys.mutable_data({gt_num, 4}, platform::CPUPlace()); - Poly2Boxes(gt_polys, boxes_from_polys.data()); - - std::vector roi_has_mask = - std::vector(fg_inds.begin(), fg_inds.end()); - phi::DenseTensor mask_class_labels; - phi::DenseTensor masks; - phi::DenseTensor rois_fg; - - auto im_scale = im_info.data()[2]; - if (fg_num > 0) { - // Class labels 
for the foreground rois - mask_class_labels.mutable_data({fg_num, 1}, ctx.GetPlace()); - Gather(label_int32_data, - 1, - fg_inds.data(), - static_cast(fg_inds.size()), - mask_class_labels.data()); - - uint8_t* masks_data = masks.mutable_data( - {fg_num, resolution * resolution}, ctx.GetPlace()); - - // Find overlap between all foreground rois and the bounding boxes - // enclosing each segmentation - T* rois_fg_data = rois_fg.mutable_data({fg_num, 4}, ctx.GetPlace()); - Gather( - rois.data(), 4, fg_inds.data(), fg_inds.size(), rois_fg.data()); - - for (int k = 0; k < rois_fg.numel(); ++k) { - rois_fg_data[k] = rois_fg_data[k] / im_scale; - } - - phi::DenseTensor overlaps_bbfg_bbpolys; - overlaps_bbfg_bbpolys.mutable_data({fg_num, gt_num}, ctx.GetPlace()); - BboxOverlaps(rois_fg, boxes_from_polys, &overlaps_bbfg_bbpolys); - - // Map from each fg rois to the index of the mask with highest overlap - // (measured by bbox overlap) - T* overlaps_bbfg_bbpolys_data = overlaps_bbfg_bbpolys.data(); - std::vector fg_masks_inds; - for (int64_t i = 0; i < fg_num; ++i) { - const T* v = overlaps_bbfg_bbpolys_data + i * gt_num; - T max_overlap = std::numeric_limits::min(); - int id = 0; - for (int64_t j = 0; j < gt_num; ++j) { - if (v[j] > max_overlap) { - max_overlap = v[j]; - id = static_cast(j); - } - } - fg_masks_inds.push_back(id); - } - - // add fg targets - for (int64_t i = 0; i < fg_num; ++i) { - int fg_polys_ind = fg_masks_inds[i]; - T* roi_fg = rois_fg_data + i * 4; - uint8_t* mask = masks_data + i * resolution * resolution; - Polys2MaskWrtBox(gt_polys[fg_polys_ind], roi_fg, resolution, mask); - } - } else { - // The network cannot handle empty blobs, so we must provide a mask - // We simply take the first bg roi, given it an all -1's mask (ignore - // label), and label it with class zero (bg). 
- int bg_num = 1; - T* rois_fg_data = rois_fg.mutable_data({bg_num, 4}, ctx.GetPlace()); - const T* rois_data = rois.data(); - std::vector bg_inds; - for (int64_t i = 0; i < roi_size; ++i) { - if (label_int32_data[i] == 0) { - bg_inds.emplace_back(i); - rois_fg_data[0] = rois_data[0] / im_scale; - rois_fg_data[1] = rois_data[1] / im_scale; - rois_fg_data[2] = rois_data[2] / im_scale; - rois_fg_data[3] = rois_data[3] / im_scale; - break; - } - } - masks.mutable_data({bg_num, resolution * resolution}, - ctx.GetPlace()); - phi::funcs::set_constant(ctx, &masks, static_cast(-1)); - int* mask_class_labels_data = - mask_class_labels.mutable_data({bg_num, 1}, ctx.GetPlace()); - mask_class_labels_data[0] = 0; - roi_has_mask = std::vector(bg_inds.begin(), bg_inds.end()); - } - - phi::DenseTensor masks_expand; - ExpandMaskTarget( - ctx, masks, mask_class_labels, resolution, num_classes, &masks_expand); - - T* rois_fg_data = rois_fg.data(); - for (int k = 0; k < rois_fg.numel(); ++k) { - rois_fg_data[k] = rois_fg_data[k] * im_scale; - } - - phi::DenseTensor roi_has_mask_t; - int roi_has_mask_size = static_cast(roi_has_mask.size()); - int* roi_has_mask_data = - roi_has_mask_t.mutable_data({roi_has_mask_size, 1}, ctx.GetPlace()); - std::copy(roi_has_mask.begin(), roi_has_mask.end(), roi_has_mask_data); - - std::vector res; - res.emplace_back(rois_fg); - res.emplace_back(roi_has_mask_t); - res.emplace_back(masks_expand); - return res; -} - -template -class GenerateMaskLabelsKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* im_info = ctx.Input("ImInfo"); - auto* gt_classes = ctx.Input("GtClasses"); - auto* is_crowd = ctx.Input("IsCrowd"); - auto* gt_segms = ctx.Input("GtSegms"); - auto* rois = ctx.Input("Rois"); - auto* label_int32 = ctx.Input("LabelsInt32"); - - auto* mask_rois = ctx.Output("MaskRois"); - auto* roi_has_mask_int32 = ctx.Output("RoiHasMaskInt32"); - auto* mask_int32 = ctx.Output("MaskInt32"); - - int num_classes = ctx.Attr("num_classes"); - int resolution = ctx.Attr("resolution"); - - PADDLE_ENFORCE_EQ( - gt_classes->lod().size(), - 1UL, - phi::errors::InvalidArgument( - "GenerateMaskLabelsOp gt_classes needs 1 level of LoD")); - PADDLE_ENFORCE_EQ( - is_crowd->lod().size(), - 1UL, - phi::errors::InvalidArgument( - "GenerateMaskLabelsOp is_crowd needs 1 level of LoD")); - PADDLE_ENFORCE_EQ(rois->lod().size(), - 1UL, - phi::errors::InvalidArgument( - "GenerateMaskLabelsOp rois needs 1 level of LoD")); - PADDLE_ENFORCE_EQ( - label_int32->lod().size(), - 1UL, - phi::errors::InvalidArgument( - "GenerateMaskLabelsOp label_int32 needs 1 level of LoD")); - - PADDLE_ENFORCE_EQ( - gt_segms->lod().size(), - 3UL, - phi::errors::InvalidArgument( - "GenerateMaskLabelsOp gt_segms needs 3 level of LoD")); - - int64_t n = static_cast(gt_classes->lod().back().size() - 1); - PADDLE_ENFORCE_EQ( - gt_segms->lod()[0].size() - 1, - n, - phi::errors::InvalidArgument( - "Batchsize of Input(gt_segms) and Input(gt_classes) should be " - "same, but received gt_segms[%d], gt_classes[%d].", - gt_segms->lod()[0].size() - 1, - n)); - - int mask_dim = num_classes * resolution * resolution; - int roi_num = static_cast(rois->lod().back()[n]); - mask_rois->mutable_data({roi_num, kBoxDim}, ctx.GetPlace()); - roi_has_mask_int32->mutable_data({roi_num, 1}, ctx.GetPlace()); - mask_int32->mutable_data({roi_num, mask_dim}, ctx.GetPlace()); - - framework::LoD lod; - std::vector lod0(1, 0); - - int64_t num_mask = 0; - auto& dev_ctx = ctx.device_context(); 
- - auto gt_classes_lod = gt_classes->lod().back(); - auto is_crowd_lod = is_crowd->lod().back(); - auto rois_lod = rois->lod().back(); - auto label_int32_lod = label_int32->lod().back(); - auto gt_segms_lod = gt_segms->lod(); - - for (int i = 0; i < n; ++i) { - if (rois_lod[i] == rois_lod[i + 1]) { - lod0.emplace_back(num_mask); - continue; - } - phi::DenseTensor im_info_slice = im_info->Slice(i, i + 1); - phi::DenseTensor gt_classes_slice = - gt_classes->Slice(static_cast(gt_classes_lod[i]), - static_cast(gt_classes_lod[i + 1])); - phi::DenseTensor is_crowd_slice = - is_crowd->Slice(static_cast(is_crowd_lod[i]), - static_cast(is_crowd_lod[i + 1])); - phi::DenseTensor label_int32_slice = - label_int32->Slice(static_cast(label_int32_lod[i]), - static_cast(label_int32_lod[i + 1])); - phi::DenseTensor rois_slice = - rois->Slice(static_cast(rois_lod[i]), - static_cast(rois_lod[i + 1])); - - auto sub_lod_and_offset = - framework::GetSubLoDAndAbsoluteOffset(gt_segms_lod, i, i + 1, 0); - auto lod_length = sub_lod_and_offset.first; - size_t s = sub_lod_and_offset.second.first; - size_t e = sub_lod_and_offset.second.second; - phi::DenseTensor gt_segms_slice = - gt_segms->Slice(static_cast(s), static_cast(e)); - - std::vector tensor_output = - SampleMaskForOneImage(dev_ctx, - im_info_slice, - gt_classes_slice, - is_crowd_slice, - gt_segms_slice, - rois_slice, - label_int32_slice, - num_classes, - resolution, - lod_length); - - phi::DenseTensor sampled_mask_rois = tensor_output[0]; - phi::DenseTensor sampled_roi_has_mask_int32 = tensor_output[1]; - phi::DenseTensor sampled_mask_int32 = tensor_output[2]; - - AppendMask(mask_rois, kBoxDim * num_mask, &sampled_mask_rois); - AppendMask( - roi_has_mask_int32, num_mask, &sampled_roi_has_mask_int32); - AppendMask(mask_int32, mask_dim * num_mask, &sampled_mask_int32); - - num_mask += sampled_mask_rois.dims()[0]; - lod0.emplace_back(num_mask); - } - - lod.emplace_back(lod0); - mask_rois->set_lod(lod); - roi_has_mask_int32->set_lod(lod); - mask_int32->set_lod(lod); - mask_rois->Resize({num_mask, kBoxDim}); - roi_has_mask_int32->Resize({num_mask, 1}); - mask_int32->Resize({num_mask, mask_dim}); - } -}; - -class GenerateMaskLabelsOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("ImInfo", - "(Tensor), This input is a 2D Tensor with shape [B, 3]. " - "B is the number of input images, " - "each element consists of im_height, im_width, im_scale."); - AddInput("GtClasses", - "(phi::DenseTensor), This input is a 2D phi::DenseTensor with " - "shape [M, 1]. " - "M is the number of groundtruth, " - "each element is a class label of groundtruth."); - AddInput( - "IsCrowd", - "(phi::DenseTensor), This input is a 2D phi::DenseTensor with shape " - "[M, 1]. " - "M is the number of groundtruth, " - "each element is a flag indicates whether a groundtruth is crowd."); - AddInput( - "GtSegms", - "(phi::DenseTensor), This input is a 2D phi::DenseTensor with shape " - "[S, 2], it's LoD " - "level is 3. The LoD[0] represents the gt objects number of each " - "instance. LoD[1] represents the segmentation counts of each objects. " - "LoD[2] represents the polygons number of each segmentation. S the " - "total number of polygons coordinate points. Each element is (x, y) " - "coordinate points."); - AddInput( - "Rois", - "(phi::DenseTensor), This input is a 2D phi::DenseTensor with shape " - "[R, 4]. 
" - "R is the number of rois which is the output of " - "generate_proposal_labels, " - "each element is a bounding box with (xmin, ymin, xmax, ymax) format."); - AddInput("LabelsInt32", - "(phi::DenseTensor), This intput is a 2D phi::DenseTensor with " - "shape [R, 1], " - "each element represents a class label of a roi"); - AddOutput( - "MaskRois", - "(phi::DenseTensor), This output is a 2D phi::DenseTensor with shape " - "[P, 4]. " - "P is the number of mask, " - "each element is a bounding box with [xmin, ymin, xmax, ymax] format."); - AddOutput("RoiHasMaskInt32", - "(phi::DenseTensor), This output is a 2D phi::DenseTensor with " - "shape [P, 1], " - "each element represents the output mask rois index with regard " - "to input rois"); - AddOutput("MaskInt32", - "(phi::DenseTensor), This output is a 4D phi::DenseTensor with " - "shape [P, Q], " - "Q equal to num_classes * resolution * resolution"); - - AddAttr("num_classes", "Class number."); - AddAttr("resolution", "Resolution of mask."); - - AddComment(R"DOC( -This operator can be, for given the RoIs and corresponding labels, -to sample foreground RoIs. This mask branch also has -a :math: `K \\times M^{2}` dimensional output targets for each foreground -RoI, which encodes K binary masks of resolution M x M, one for each of the -K classes. This mask targets are used to compute loss of mask branch. - )DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR( - generate_mask_labels, - ops::GenerateMaskLabelsOp, - ops::GenerateMaskLabelsOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); - -PD_REGISTER_STRUCT_KERNEL(generate_mask_labels, - CPU, - ALL_LAYOUT, - ops::GenerateMaskLabelsKernel, - float) {} diff --git a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc deleted file mode 100644 index ad37aa2ae682f..0000000000000 --- a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc +++ /dev/null @@ -1,837 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -#include -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/operators/detection/bbox_util.h" -#include "paddle/fluid/operators/math/concat_and_split.h" -#include "paddle/phi/kernels/funcs/gather.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -const int kBoxDim = 4; - -template -void AppendRois(phi::DenseTensor* out, - int64_t offset, - phi::DenseTensor* to_add) { - auto* out_data = out->data(); - auto* to_add_data = to_add->data(); - memcpy(out_data + offset, to_add_data, to_add->numel() * sizeof(T)); -} - -// Filter the ground-truth in RoIs and the RoIs with non-positive area. 
-// The ground-truth has max overlap with itself so the max_overlap is 1 -// and the corresponding RoI will be removed. -template -void FilterRoIs(const platform::DeviceContext& ctx, - const phi::DenseTensor& rpn_rois, - const phi::DenseTensor& max_overlap, - phi::DenseTensor* keep) { - const T* rpn_rois_dt = rpn_rois.data(); - const T* max_overlap_dt = max_overlap.data(); - int rois_num = static_cast(max_overlap.numel()); - keep->Resize({rois_num}); - int* keep_data = keep->mutable_data(ctx.GetPlace()); - int keep_len = 0; - for (int i = 0; i < rois_num; ++i) { - if ((rpn_rois_dt[i * 4 + 2] - rpn_rois_dt[i * 4 + 0] + 1) > 0 && - (rpn_rois_dt[i * 4 + 3] - rpn_rois_dt[i * 4 + 1] + 1) > 0 && - max_overlap_dt[i] < 1.) { - keep_data[keep_len++] = i; - } - } - keep->Resize({keep_len}); -} - -class GenerateProposalLabelsOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasInput("RpnRois"), - true, - phi::errors::NotFound("Input(RpnRois) shouldn't be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("GtClasses"), - true, - phi::errors::NotFound("Input(GtClasses) shouldn't be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("IsCrowd"), - true, - phi::errors::NotFound("Input(IsCrowd) shouldn't be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("GtBoxes"), - true, - phi::errors::NotFound("Input(GtBoxes) shouldn't be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("ImInfo"), - true, - phi::errors::NotFound("Input(ImInfo) shouldn't be null.")); - - PADDLE_ENFORCE_EQ( - ctx->HasOutput("Rois"), - true, - phi::errors::NotFound( - "Output(Rois) of GenerateProposalLabelsOp should not be null")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("LabelsInt32"), - true, - phi::errors::NotFound("Output(LabelsInt32) of " - "GenerateProposalLabelsOp " - "should not be null")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("BboxTargets"), - true, - phi::errors::NotFound("Output(BboxTargets) of " - "GenerateProposalLabelsOp " - "should not be null")); - PADDLE_ENFORCE_EQ( - ctx->HasOutput("BboxInsideWeights"), - true, - phi::errors::NotFound( - "Output(BboxInsideWeights) of GenerateProposalLabelsOp " - "should not be null")); - PADDLE_ENFORCE_EQ( - ctx->HasOutput("BboxOutsideWeights"), - true, - phi::errors::NotFound( - "Output(BboxOutsideWeights) of GenerateProposalLabelsOp " - "should not be null")); - - auto rpn_rois_dims = ctx->GetInputDim("RpnRois"); - auto gt_boxes_dims = ctx->GetInputDim("GtBoxes"); - auto im_info_dims = ctx->GetInputDim("ImInfo"); - - PADDLE_ENFORCE_EQ(rpn_rois_dims.size(), - 2, - phi::errors::InvalidArgument( - "The dimensions size of Input(RpnRois) must be 2. " - "But received dimensions size=[%d], dimensions=[%s].", - rpn_rois_dims.size(), - rpn_rois_dims)); - PADDLE_ENFORCE_EQ(gt_boxes_dims.size(), - 2, - phi::errors::InvalidArgument( - "The dimensions size of Input(GtBoxes) must be 2. " - "But received dimensions size=[%d], dimensions=[%s].", - gt_boxes_dims.size(), - gt_boxes_dims)); - PADDLE_ENFORCE_EQ(im_info_dims.size(), - 2, - phi::errors::InvalidArgument( - "The dimensions size of Input(ImInfo) must be 2. 
But " - "received dimensions size=[%d], dimensions=[%s].", - im_info_dims.size(), - im_info_dims)); - - int class_nums = ctx->Attrs().Get("class_nums"); - bool is_cascade_rcnn = ctx->Attrs().Get("is_cascade_rcnn"); - if (is_cascade_rcnn) { - PADDLE_ENFORCE_EQ( - ctx->HasInput("MaxOverlap"), - true, - phi::errors::NotFound( - "Input(MaxOverlap) of GenerateProposalLabelsOp " - "should not be null when is_cascade_rcnn is True.")); - } - - ctx->SetOutputDim("Rois", {-1, 4}); - ctx->SetOutputDim("LabelsInt32", {-1, 1}); - ctx->SetOutputDim("BboxTargets", {-1, 4 * class_nums}); - ctx->SetOutputDim("BboxInsideWeights", {-1, 4 * class_nums}); - ctx->SetOutputDim("BboxOutsideWeights", {-1, 4 * class_nums}); - ctx->SetOutputDim("MaxOverlapWithGT", {-1}); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "RpnRois"); - return phi::KernelKey(data_type, platform::CPUPlace()); - } -}; - -template -void Concat(const phi::CPUContext& context, - const phi::DenseTensor& in_tensor_a, - const phi::DenseTensor& in_tensor_b, - phi::DenseTensor* out_tensor) { - int axis = 0; - std::vector inputs; - inputs.emplace_back(in_tensor_a); - inputs.emplace_back(in_tensor_b); - math::ConcatFunctor concat_functor; - concat_functor(context, inputs, axis, out_tensor); -} - -template -std::vector> SampleFgBgGt(const phi::CPUContext& context, - phi::DenseTensor* iou, - const phi::DenseTensor& is_crowd, - const int batch_size_per_im, - const float fg_fraction, - const float fg_thresh, - const float bg_thresh_hi, - const float bg_thresh_lo, - std::minstd_rand engine, - const bool use_random, - const bool is_cascade_rcnn, - const phi::DenseTensor& rpn_rois) { - std::vector fg_inds; - std::vector bg_inds; - std::vector mapped_gt_inds; - int64_t gt_num = is_crowd.numel(); - const int* crowd_data = is_crowd.data(); - T* proposal_to_gt_overlaps = iou->data(); - int64_t row = iou->dims()[0]; - int64_t col = iou->dims()[1]; - float epsilon = 0.00001; - // Follow the Faster RCNN's implementation - for (int64_t i = 0; i < row; ++i) { - const T* v = proposal_to_gt_overlaps + i * col; - - T max_overlap = *std::max_element(v, v + col); - if ((i < gt_num) && (crowd_data[i])) { - max_overlap = -1.0; - } - if (max_overlap >= fg_thresh) { - // fg mapped gt label index - for (int64_t j = 0; j < col; ++j) { - T val = proposal_to_gt_overlaps[i * col + j]; - auto diff = std::abs(max_overlap - val); - if (diff < epsilon) { - fg_inds.emplace_back(i); - mapped_gt_inds.emplace_back(j); - break; - } - } - } else if ((max_overlap >= bg_thresh_lo) && (max_overlap < bg_thresh_hi)) { - bg_inds.emplace_back(i); - } else { - continue; - } - } - - std::vector> res; - if (is_cascade_rcnn) { - res.emplace_back(fg_inds); - res.emplace_back(bg_inds); - res.emplace_back(mapped_gt_inds); - } else { - // Reservoir Sampling - // sampling fg - std::uniform_real_distribution uniform(0, 1); - int fg_rois_per_im = std::floor(batch_size_per_im * fg_fraction); // NOLINT - int fg_rois_this_image = static_cast(fg_inds.size()); - int fg_rois_per_this_image = std::min(fg_rois_per_im, fg_rois_this_image); - if (use_random) { - const int64_t fg_size = static_cast(fg_inds.size()); - if (fg_size > fg_rois_per_this_image) { - for (int64_t i = fg_rois_per_this_image; i < fg_size; ++i) { - int rng_ind = std::floor(uniform(engine) * i); // NOLINT - if (rng_ind < fg_rois_per_this_image) { - std::iter_swap(fg_inds.begin() + rng_ind, fg_inds.begin() + i); - 
std::iter_swap(mapped_gt_inds.begin() + rng_ind, - mapped_gt_inds.begin() + i); - } - } - } - } - std::vector new_fg_inds(fg_inds.begin(), - fg_inds.begin() + fg_rois_per_this_image); - std::vector new_gt_inds( - mapped_gt_inds.begin(), - mapped_gt_inds.begin() + fg_rois_per_this_image); - // sampling bg - int bg_rois_per_image = batch_size_per_im - fg_rois_per_this_image; - int bg_rois_this_image = static_cast(bg_inds.size()); - int bg_rois_per_this_image = - std::min(bg_rois_per_image, bg_rois_this_image); - if (use_random) { - const int64_t bg_size = static_cast(bg_inds.size()); - if (bg_size > bg_rois_per_this_image) { - for (int64_t i = bg_rois_per_this_image; i < bg_size; ++i) { - int rng_ind = std::floor(uniform(engine) * i); // NOLINT - if (rng_ind < fg_rois_per_this_image) - std::iter_swap(bg_inds.begin() + rng_ind, bg_inds.begin() + i); - } - } - } - std::vector new_bg_inds(bg_inds.begin(), - bg_inds.begin() + bg_rois_per_this_image); - // - res.emplace_back(new_fg_inds); - res.emplace_back(new_bg_inds); - res.emplace_back(new_gt_inds); - } - - return res; -} - -template -void GatherBoxesLabels(const phi::CPUContext& context, - const phi::DenseTensor& boxes, - const phi::DenseTensor& max_overlap, - const phi::DenseTensor& gt_boxes, - const phi::DenseTensor& gt_classes, - const std::vector& fg_inds, - const std::vector& bg_inds, - const std::vector& gt_inds, - phi::DenseTensor* sampled_boxes, - phi::DenseTensor* sampled_labels, - phi::DenseTensor* sampled_gts, - phi::DenseTensor* sampled_max_overlap) { - int fg_num = static_cast(fg_inds.size()); - int bg_num = static_cast(bg_inds.size()); - phi::DenseTensor fg_inds_t, bg_inds_t, gt_box_inds_t, gt_label_inds_t; - int* fg_inds_data = fg_inds_t.mutable_data({fg_num}, context.GetPlace()); - int* bg_inds_data = bg_inds_t.mutable_data({bg_num}, context.GetPlace()); - int* gt_box_inds_data = - gt_box_inds_t.mutable_data({fg_num}, context.GetPlace()); - int* gt_label_inds_data = - gt_label_inds_t.mutable_data({fg_num}, context.GetPlace()); - std::copy(fg_inds.begin(), fg_inds.end(), fg_inds_data); - std::copy(bg_inds.begin(), bg_inds.end(), bg_inds_data); - std::copy(gt_inds.begin(), gt_inds.end(), gt_box_inds_data); - std::copy(gt_inds.begin(), gt_inds.end(), gt_label_inds_data); - - phi::DenseTensor fg_boxes, bg_boxes, fg_labels, bg_labels; - fg_boxes.mutable_data({fg_num, kBoxDim}, context.GetPlace()); - phi::funcs::CPUGather(context, boxes, fg_inds_t, &fg_boxes); - bg_boxes.mutable_data({bg_num, kBoxDim}, context.GetPlace()); - phi::funcs::CPUGather(context, boxes, bg_inds_t, &bg_boxes); - Concat(context, fg_boxes, bg_boxes, sampled_boxes); - phi::funcs::CPUGather(context, gt_boxes, gt_box_inds_t, sampled_gts); - fg_labels.mutable_data({fg_num}, context.GetPlace()); - phi::funcs::CPUGather(context, gt_classes, gt_label_inds_t, &fg_labels); - bg_labels.mutable_data({bg_num}, context.GetPlace()); - phi::funcs::set_constant(context, &bg_labels, static_cast(0)); - Concat(context, fg_labels, bg_labels, sampled_labels); - - phi::DenseTensor fg_max_overlap, bg_max_overlap; - fg_max_overlap.mutable_data({fg_num}, context.GetPlace()); - phi::funcs::CPUGather(context, max_overlap, fg_inds_t, &fg_max_overlap); - bg_max_overlap.mutable_data({bg_num}, context.GetPlace()); - phi::funcs::CPUGather(context, max_overlap, bg_inds_t, &bg_max_overlap); - Concat(context, fg_max_overlap, bg_max_overlap, sampled_max_overlap); -} - -template -std::vector SampleRoisForOneImage( - const phi::CPUContext& context, - const phi::DenseTensor& rpn_rois_in, - const 
phi::DenseTensor& gt_classes, - const phi::DenseTensor& is_crowd, - const phi::DenseTensor& gt_boxes, - const phi::DenseTensor& im_info, - const int batch_size_per_im, - const float fg_fraction, - const float fg_thresh, - const float bg_thresh_hi, - const float bg_thresh_lo, - const std::vector& bbox_reg_weights, - const int class_nums, - std::minstd_rand engine, - bool use_random, - bool is_cascade_rcnn, - bool is_cls_agnostic, - const phi::DenseTensor& max_overlap) { - // 1.1 map to original image - auto im_scale = im_info.data()[2]; - phi::DenseTensor rpn_rois; - rpn_rois.mutable_data(rpn_rois_in.dims(), context.GetPlace()); - const T* rpn_rois_in_dt = rpn_rois_in.data(); - T* rpn_rois_dt = rpn_rois.data(); - - for (int i = 0; i < rpn_rois.numel(); ++i) { - rpn_rois_dt[i] = rpn_rois_in_dt[i] / im_scale; - } - - int proposals_num = 1; - - if (is_cascade_rcnn) { - phi::DenseTensor keep; - FilterRoIs(context, rpn_rois, max_overlap, &keep); - phi::DenseTensor roi_filter; - // phi::DenseTensor box_filter; - if (keep.numel() == 0) { - phi::funcs::SetConstant set_zero; - roi_filter.mutable_data({proposals_num, kBoxDim}, context.GetPlace()); - set_zero(context, &roi_filter, static_cast(0)); - } else { - proposals_num = static_cast(keep.numel()); - roi_filter.mutable_data({proposals_num, kBoxDim}, context.GetPlace()); - phi::funcs::CPUGather(context, rpn_rois, keep, &roi_filter); - } - T* roi_filter_dt = roi_filter.data(); - memcpy(rpn_rois_dt, roi_filter_dt, roi_filter.numel() * sizeof(T)); - rpn_rois.Resize(roi_filter.dims()); - } else { - proposals_num = static_cast(rpn_rois.dims()[0]); - } - // 1.2 compute overlaps - proposals_num += static_cast(gt_boxes.dims()[0]); - - phi::DenseTensor proposal_to_gt_overlaps; - proposal_to_gt_overlaps.mutable_data({proposals_num, gt_boxes.dims()[0]}, - context.GetPlace()); - - phi::DenseTensor boxes; - boxes.mutable_data({proposals_num, kBoxDim}, context.GetPlace()); - Concat(context, gt_boxes, rpn_rois, &boxes); - BboxOverlaps(boxes, gt_boxes, &proposal_to_gt_overlaps); - - phi::DenseTensor proposal_with_max_overlap; - proposal_with_max_overlap.mutable_data({proposals_num}, - context.GetPlace()); - - MaxIoU(proposal_to_gt_overlaps, &proposal_with_max_overlap); - - // Generate proposal index - std::vector> fg_bg_gt = - SampleFgBgGt(context, - &proposal_to_gt_overlaps, - is_crowd, - batch_size_per_im, - fg_fraction, - fg_thresh, - bg_thresh_hi, - bg_thresh_lo, - engine, - use_random, - is_cascade_rcnn, - boxes); - std::vector fg_inds = fg_bg_gt[0]; - std::vector bg_inds = fg_bg_gt[1]; - std::vector mapped_gt_inds = fg_bg_gt[2]; // mapped_gt_labels - - // Gather boxes and labels - phi::DenseTensor sampled_boxes, sampled_labels, sampled_gts, - sampled_max_overlap; - int fg_num = static_cast(fg_inds.size()); - int bg_num = static_cast(bg_inds.size()); - int boxes_num = fg_num + bg_num; - framework::DDim bbox_dim({boxes_num, kBoxDim}); - sampled_boxes.mutable_data(bbox_dim, context.GetPlace()); - sampled_labels.mutable_data({boxes_num}, context.GetPlace()); - sampled_gts.mutable_data({fg_num, kBoxDim}, context.GetPlace()); - sampled_max_overlap.mutable_data({boxes_num}, context.GetPlace()); - GatherBoxesLabels(context, - boxes, - proposal_with_max_overlap, - gt_boxes, - gt_classes, - fg_inds, - bg_inds, - mapped_gt_inds, - &sampled_boxes, - &sampled_labels, - &sampled_gts, - &sampled_max_overlap); - - // Compute targets - phi::DenseTensor bbox_targets_single; - bbox_targets_single.mutable_data(bbox_dim, context.GetPlace()); - BoxToDelta(fg_num, - sampled_boxes, 
- sampled_gts, - bbox_reg_weights.data(), - false, - &bbox_targets_single); - - // Scale rois - phi::DenseTensor sampled_rois; - sampled_rois.mutable_data(sampled_boxes.dims(), context.GetPlace()); - auto sampled_rois_et = framework::EigenTensor::From(sampled_rois); - auto sampled_boxes_et = framework::EigenTensor::From(sampled_boxes); - sampled_rois_et = sampled_boxes_et * im_scale; - - // Expand box targets - phi::DenseTensor bbox_targets, bbox_inside_weights, bbox_outside_weights; - framework::DDim bbox_expand_dim({boxes_num, kBoxDim * class_nums}); - bbox_targets.mutable_data(bbox_expand_dim, context.GetPlace()); - bbox_inside_weights.mutable_data(bbox_expand_dim, context.GetPlace()); - bbox_outside_weights.mutable_data(bbox_expand_dim, context.GetPlace()); - phi::funcs::set_constant(context, &bbox_targets, static_cast(0.0)); - phi::funcs::set_constant(context, &bbox_inside_weights, static_cast(0.0)); - phi::funcs::set_constant(context, &bbox_outside_weights, static_cast(0.0)); - - auto* bbox_targets_single_data = bbox_targets_single.data(); - auto* sampled_labels_data = sampled_labels.data(); - auto* bbox_targets_data = bbox_targets.data(); - auto* bbox_inside_weights_data = bbox_inside_weights.data(); - auto* bbox_outside_weights_data = bbox_outside_weights.data(); - int width = kBoxDim * class_nums; - for (int64_t i = 0; i < boxes_num; ++i) { - int label = sampled_labels_data[i]; - if (label > 0) { - if (is_cls_agnostic) { - label = 1; - } - int dst_idx = static_cast(i * width + kBoxDim * label); - int src_idx = static_cast(kBoxDim * i); - bbox_targets_data[dst_idx] = bbox_targets_single_data[src_idx]; - bbox_targets_data[dst_idx + 1] = bbox_targets_single_data[src_idx + 1]; - bbox_targets_data[dst_idx + 2] = bbox_targets_single_data[src_idx + 2]; - bbox_targets_data[dst_idx + 3] = bbox_targets_single_data[src_idx + 3]; - bbox_inside_weights_data[dst_idx] = 1; - bbox_inside_weights_data[dst_idx + 1] = 1; - bbox_inside_weights_data[dst_idx + 2] = 1; - bbox_inside_weights_data[dst_idx + 3] = 1; - bbox_outside_weights_data[dst_idx] = 1; - bbox_outside_weights_data[dst_idx + 1] = 1; - bbox_outside_weights_data[dst_idx + 2] = 1; - bbox_outside_weights_data[dst_idx + 3] = 1; - } - } - std::vector res; - res.emplace_back(sampled_rois); - res.emplace_back(sampled_labels); - res.emplace_back(bbox_targets); - res.emplace_back(bbox_inside_weights); - res.emplace_back(bbox_outside_weights); - res.emplace_back(sampled_max_overlap); - return res; -} - -template -class GenerateProposalLabelsKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* rpn_rois = context.Input("RpnRois"); - auto* gt_classes = context.Input("GtClasses"); - auto* is_crowd = context.Input("IsCrowd"); - auto* gt_boxes = context.Input("GtBoxes"); - auto* im_info = context.Input("ImInfo"); - - auto* rois = context.Output("Rois"); - auto* labels_int32 = context.Output("LabelsInt32"); - auto* bbox_targets = context.Output("BboxTargets"); - auto* bbox_inside_weights = - context.Output("BboxInsideWeights"); - auto* bbox_outside_weights = - context.Output("BboxOutsideWeights"); - auto* max_overlap_with_gt = - context.Output("MaxOverlapWithGT"); - - int batch_size_per_im = context.Attr("batch_size_per_im"); - float fg_fraction = context.Attr("fg_fraction"); - float fg_thresh = context.Attr("fg_thresh"); - float bg_thresh_hi = context.Attr("bg_thresh_hi"); - float bg_thresh_lo = context.Attr("bg_thresh_lo"); - std::vector bbox_reg_weights = - 
context.Attr>("bbox_reg_weights"); - int class_nums = context.Attr("class_nums"); - bool use_random = context.Attr("use_random"); - bool is_cascade_rcnn = context.Attr("is_cascade_rcnn"); - bool is_cls_agnostic = context.Attr("is_cls_agnostic"); - PADDLE_ENFORCE_EQ( - rpn_rois->lod().size(), - 1UL, - phi::errors::InvalidArgument( - "GenerateProposalLabelsOp rpn_rois needs 1 level of LoD. But " - "received level of LoD is [%d], LoD is [%s].", - rpn_rois->lod().size(), - rpn_rois->lod())); - PADDLE_ENFORCE_EQ( - gt_classes->lod().size(), - 1UL, - phi::errors::InvalidArgument( - "GenerateProposalLabelsOp gt_classes needs 1 level of LoD. But " - "received level of LoD is [%d], LoD is [%s].", - gt_classes->lod().size(), - gt_classes->lod())); - PADDLE_ENFORCE_EQ( - is_crowd->lod().size(), - 1UL, - phi::errors::InvalidArgument( - "GenerateProposalLabelsOp is_crowd needs 1 level of LoD. But " - "received level of LoD is [%d], LoD is [%s].", - is_crowd->lod().size(), - is_crowd->lod())); - PADDLE_ENFORCE_EQ( - gt_boxes->lod().size(), - 1UL, - phi::errors::InvalidArgument( - "GenerateProposalLabelsOp gt_boxes needs 1 level of LoD. But " - "received level of LoD is [%d], LoD is [%s].", - gt_boxes->lod().size(), - gt_boxes->lod())); - int64_t n = static_cast(rpn_rois->lod().back().size() - 1); - int64_t rois_num = rpn_rois->dims()[0]; - int64_t gts_num = gt_boxes->dims()[0]; - int64_t init_num = - is_cascade_rcnn ? rois_num + gts_num : n * batch_size_per_im; - - rois->mutable_data({init_num, kBoxDim}, context.GetPlace()); - labels_int32->mutable_data({init_num, 1}, context.GetPlace()); - bbox_targets->mutable_data({init_num, kBoxDim * class_nums}, - context.GetPlace()); - bbox_inside_weights->mutable_data({init_num, kBoxDim * class_nums}, - context.GetPlace()); - bbox_outside_weights->mutable_data({init_num, kBoxDim * class_nums}, - context.GetPlace()); - max_overlap_with_gt->Resize({init_num}); - max_overlap_with_gt->mutable_data(context.GetPlace()); - - std::random_device rnd; - std::minstd_rand engine; - int seed = static_cast(rnd()); - engine.seed(seed); - - framework::LoD lod; - std::vector lod0(1, 0); - - int64_t num_rois = 0; - auto& dev_ctx = context.device_context(); - - auto rpn_rois_lod = rpn_rois->lod().back(); - auto gt_classes_lod = gt_classes->lod().back(); - auto is_crowd_lod = is_crowd->lod().back(); - auto gt_boxes_lod = gt_boxes->lod().back(); - for (int i = 0; i < n; ++i) { - if (rpn_rois_lod[i] == rpn_rois_lod[i + 1]) { - lod0.emplace_back(num_rois); - continue; - } - phi::DenseTensor rpn_rois_slice = - rpn_rois->Slice(static_cast(rpn_rois_lod[i]), - static_cast(rpn_rois_lod[i + 1])); - phi::DenseTensor gt_classes_slice = - gt_classes->Slice(static_cast(gt_classes_lod[i]), - static_cast(gt_classes_lod[i + 1])); - phi::DenseTensor is_crowd_slice = - is_crowd->Slice(static_cast(is_crowd_lod[i]), - static_cast(is_crowd_lod[i + 1])); - phi::DenseTensor gt_boxes_slice = - gt_boxes->Slice(static_cast(gt_boxes_lod[i]), - static_cast(gt_boxes_lod[i + 1])); - phi::DenseTensor im_info_slice = im_info->Slice(i, i + 1); - phi::DenseTensor max_overlap_slice; - if (is_cascade_rcnn) { - auto* max_overlap = context.Input("MaxOverlap"); - max_overlap_slice = - max_overlap->Slice(static_cast(rpn_rois_lod[i]), - static_cast(rpn_rois_lod[i + 1])); - } else { - max_overlap_slice.mutable_data({rpn_rois_slice.dims()[0]}, - context.GetPlace()); - } - std::vector tensor_output = - SampleRoisForOneImage(dev_ctx, - rpn_rois_slice, - gt_classes_slice, - is_crowd_slice, - gt_boxes_slice, - im_info_slice, - 
batch_size_per_im, - fg_fraction, - fg_thresh, - bg_thresh_hi, - bg_thresh_lo, - bbox_reg_weights, - class_nums, - engine, - use_random, - is_cascade_rcnn, - is_cls_agnostic, - max_overlap_slice); - phi::DenseTensor sampled_rois = tensor_output[0]; - phi::DenseTensor sampled_labels_int32 = tensor_output[1]; - phi::DenseTensor sampled_bbox_targets = tensor_output[2]; - phi::DenseTensor sampled_bbox_inside_weights = tensor_output[3]; - phi::DenseTensor sampled_bbox_outside_weights = tensor_output[4]; - phi::DenseTensor sampled_max_overlap = tensor_output[5]; - - AppendRois(rois, kBoxDim * num_rois, &sampled_rois); - AppendRois(labels_int32, num_rois, &sampled_labels_int32); - int64_t offset = kBoxDim * num_rois * class_nums; - AppendRois(bbox_targets, offset, &sampled_bbox_targets); - AppendRois(bbox_inside_weights, offset, &sampled_bbox_inside_weights); - AppendRois( - bbox_outside_weights, offset, &sampled_bbox_outside_weights); - AppendRois(max_overlap_with_gt, num_rois, &sampled_max_overlap); - - num_rois += sampled_rois.dims()[0]; - lod0.emplace_back(num_rois); - } - - lod.emplace_back(lod0); - rois->set_lod(lod); - labels_int32->set_lod(lod); - bbox_targets->set_lod(lod); - bbox_inside_weights->set_lod(lod); - bbox_outside_weights->set_lod(lod); - rois->Resize({num_rois, kBoxDim}); - labels_int32->Resize({num_rois, 1}); - bbox_targets->Resize({num_rois, kBoxDim * class_nums}); - bbox_inside_weights->Resize({num_rois, kBoxDim * class_nums}); - bbox_outside_weights->Resize({num_rois, kBoxDim * class_nums}); - max_overlap_with_gt->Resize({num_rois}); - max_overlap_with_gt->set_lod(lod); - } -}; - -class GenerateProposalLabelsOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput( - "RpnRois", - "(phi::DenseTensor), This input is a 2D phi::DenseTensor with shape " - "[N, 4]. " - "N is the number of the GenerateProposalOp's output, " - "each element is a bounding box with [xmin, ymin, xmax, ymax] format."); - AddInput("GtClasses", - "(phi::DenseTensor), This input is a 2D phi::DenseTensor with " - "shape [M, 1]. " - "M is the number of groundtruth, " - "each element is a class label of groundtruth."); - AddInput( - "IsCrowd", - "(phi::DenseTensor), This input is a 2D phi::DenseTensor with shape " - "[M, 1]. " - "M is the number of groundtruth, " - "each element is a flag indicates whether a groundtruth is crowd."); - AddInput( - "GtBoxes", - "(phi::DenseTensor), This input is a 2D phi::DenseTensor with shape " - "[M, 4]. " - "M is the number of groundtruth, " - "each element is a bounding box with [xmin, ymin, xmax, ymax] format."); - AddInput("ImInfo", - "(Tensor), This input is a 2D Tensor with shape [B, 3]. " - "B is the number of input images, " - "each element consists of im_height, im_width, im_scale."); - AddInput("MaxOverlap", - "(phi::DenseTensor), This input is a 1D phi::DenseTensor with " - "shape [N]." - "N is the number of Input(RpnRois), " - "each element is the maximum overlap between " - "the proposal RoI and ground-truth.") - .AsDispensable(); - - AddOutput( - "Rois", - "(phi::DenseTensor), This output is a 2D phi::DenseTensor with shape " - "[P, 4]. 
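// [Editor's sketch, not part of the original op] After sampling, AppendRois
// memcpy-appends each image's rows at a running offset while lod0 records the
// cumulative row count, so the outputs carry a per-image LoD; the final
// Resize calls trim the conservatively preallocated buffers to num_rois.
// Minimal illustration of the append-at-offset pattern (AppendRows is a
// hypothetical stand-in):
#include <cstring>
#include <vector>
static void AppendRows(std::vector<float>* out, size_t row_offset,
                       const std::vector<float>& src, size_t width) {
  // out is row-major with `width` floats per row and must already be large
  // enough, mirroring the preallocation in the kernel above.
  std::memcpy(out->data() + row_offset * width, src.data(),
              src.size() * sizeof(float));
}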
" - "P usuall equal to batch_size_per_im * batch_size, " - "each element is a bounding box with [xmin, ymin, xmax, ymax] format."); - AddOutput("LabelsInt32", - "(phi::DenseTensor), This output is a 2D phi::DenseTensor with " - "shape [P, 1], " - "each element represents a class label of a roi"); - AddOutput("BboxTargets", - "(phi::DenseTensor), This output is a 2D phi::DenseTensor with " - "shape [P, 4 * " - "class_nums], " - "each element represents a box label of a roi"); - AddOutput( - "BboxInsideWeights", - "(phi::DenseTensor), This output is a 2D phi::DenseTensor with shape " - "[P, 4 * " - "class_nums], " - "each element indicates whether a box should contribute to loss."); - AddOutput( - "BboxOutsideWeights", - "(phi::DenseTensor), This output is a 2D phi::DenseTensor with shape " - "[P, 4 * " - "class_nums], " - "each element indicates whether a box should contribute to loss."); - AddOutput("MaxOverlapWithGT", - "(phi::DenseTensor), This output is a 1D phi::DenseTensor with " - "shape [P], " - "each element indicates the maxoverlap " - "between output RoIs and ground-truth. " - "The output RoIs may include ground-truth " - "and the output maxoverlap may contain 1."); - - AddAttr("batch_size_per_im", "Batch size of rois per images."); - AddAttr("fg_fraction", - "Foreground fraction in total batch_size_per_im."); - AddAttr( - "fg_thresh", - "Overlap threshold which is used to chose foreground sample."); - AddAttr("bg_thresh_hi", - "Overlap threshold upper bound which is used to chose " - "background sample."); - AddAttr("bg_thresh_lo", - "Overlap threshold lower bound which is used to chose " - "background sample."); - AddAttr>("bbox_reg_weights", "Box regression weights."); - AddAttr("class_nums", "Class number."); - AddAttr( - "use_random", - "Use random sampling to choose foreground and background boxes.") - .SetDefault(true); - AddAttr("is_cascade_rcnn", - "cascade rcnn sampling policy changed from stage 2.") - .SetDefault(false); - AddAttr( - "is_cls_agnostic", - "the box regress will only include fg and bg locations if set true ") - .SetDefault(false); - - AddComment(R"DOC( -This operator can be, for given the GenerateProposalOp output bounding boxes and groundtruth, -to sample foreground boxes and background boxes, and compute loss target. - -RpnRois is the output boxes of RPN and was processed by generate_proposal_op, these boxes -were combined with groundtruth boxes and sampled according to batch_size_per_im and fg_fraction, -If an instance with a groundtruth overlap greater than fg_thresh, then it was considered as a foreground sample. -If an instance with a groundtruth overlap greater than bg_thresh_lo and lower than bg_thresh_hi, -then it was considered as a background sample. -After all foreground and background boxes are chosen (so called Rois), -then we apply random sampling to make sure -the number of foreground boxes is no more than batch_size_per_im * fg_fraction. - -For each box in Rois, we assign the classification (class label) and regression targets (box label) to it. -Finally BboxInsideWeights and BboxOutsideWeights are used to specify whether it would contribute to training loss. 
- )DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR( - generate_proposal_labels, - ops::GenerateProposalLabelsOp, - ops::GenerateProposalLabelsOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); -PD_REGISTER_STRUCT_KERNEL(generate_proposal_labels, - CPU, - ALL_LAYOUT, - ops::GenerateProposalLabelsKernel, - float, - double) {} - -REGISTER_OP_VERSION(generate_proposal_labels) - .AddCheckpoint( - R"ROC( - Upgrade of output [MaxOverlapWithGT])ROC", - paddle::framework::compatible::OpVersionDesc().NewOutput( - "MaxOverlapWithGT", - "The maxoverlap between output RoIs and ground-truth.")) - .AddCheckpoint( - R"ROC( - Upgrade generate_proposal_labels add a new input [MaxOverlap])ROC", - paddle::framework::compatible::OpVersionDesc().NewInput( - "MaxOverlap", "MaxOverlap is dispensable.")); diff --git a/paddle/fluid/operators/detection/mask_util.cc b/paddle/fluid/operators/detection/mask_util.cc deleted file mode 100644 index 5b4dc92f4f6af..0000000000000 --- a/paddle/fluid/operators/detection/mask_util.cc +++ /dev/null @@ -1,242 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/detection/mask_util.h" - -#include -#include - -#include "paddle/fluid/memory/memory.h" - -namespace paddle { -namespace operators { - -uint32_t UMax(uint32_t a, uint32_t b) { return (a > b) ? a : b; } - -static inline int Compare(const void* a, const void* b) { - uint32_t c = *(reinterpret_cast(a)); - uint32_t d = *(reinterpret_cast(b)); - return c > d ? 1 : c < d ? -1 : 0; -} - -void Decode(const uint32_t* cnts, int m, uint8_t* mask) { - uint8_t v = 0; - for (int j = 0; j < m; j++) { - for (uint32_t k = 0; k < cnts[j]; k++) { - *(mask++) = v; - } - v = !v; - } -} - -typedef uint32_t uint; -void Poly2Mask(const float* xy, int k, int h, int w, uint8_t* mask) { - int j = 0, m = 0; - double scale = 5; - int *x = nullptr, *y = nullptr, *u = nullptr, *v = nullptr; - uint *a = nullptr, *b = nullptr; - platform::CPUPlace cpu; - auto xptr = memory::Alloc(cpu, sizeof(int) * (k + 1) * 2); - x = reinterpret_cast(xptr->ptr()); - y = x + (k + 1); - - for (j = 0; j < k; j++) - x[j] = static_cast(std::lround(scale * xy[j * 2 + 0])); - x[k] = x[0]; - for (j = 0; j < k; j++) - y[j] = static_cast(std::lround(scale * xy[j * 2 + 1])); - y[k] = y[0]; - for (j = 0; j < k; j++) { - m += static_cast(UMax(abs(x[j] - x[j + 1]), abs(y[j] - y[j + 1])) + 1); - } - auto vptr = memory::Alloc(cpu, sizeof(int) * m * 2); - u = reinterpret_cast(vptr->ptr()); - v = u + m; - m = 0; - for (j = 0; j < k; j++) { - int xs = x[j], xe = x[j + 1], ys = y[j], ye = y[j + 1], dx = 0, dy = 0, - t = 0, d = 0; - int flip = 0; - double s = NAN; - dx = abs(xe - xs); - dy = abs(ys - ye); - flip = (dx >= dy && xs > xe) || (dx < dy && ys > ye); - if (flip) { - t = xs; - xs = xe; - xe = t; - t = ys; - ys = ye; - ye = t; - } - if (dx >= dy) { - s = dx == 0 ? 
0 : static_cast(ye - ys) / dx; - for (d = 0; d <= dx; d++) { - t = flip ? dx - d : d; - u[m] = t + xs; - v[m] = static_cast(std::lround(ys + s * t)); - m++; - } - } else { - s = dy == 0 ? 0 : static_cast(xe - xs) / dy; - for (d = 0; d <= dy; d++) { - t = flip ? dy - d : d; - v[m] = t + ys; - u[m] = static_cast(std::lround(xs + s * t)); - m++; - } - } - } - /* get points along y-boundary and downsample */ - k = m; - m = 0; - double xd = NAN, yd = NAN; - auto xyptr = memory::Alloc(cpu, sizeof(int) * k * 2); - x = reinterpret_cast(xyptr->ptr()); - y = x + k; - for (j = 1; j < k; j++) { - if (u[j] != u[j - 1]) { - xd = static_cast(u[j] < u[j - 1] ? u[j] : u[j] - 1); - xd = (xd + .5) / scale - .5; - if (floor(xd) != xd || xd < 0 || xd > w - 1) continue; - yd = static_cast(v[j] < v[j - 1] ? v[j] : v[j - 1]); - yd = (yd + .5) / scale - .5; - if (yd < 0) - yd = 0; - else if (yd > h) - yd = h; - yd = ceil(yd); - x[m] = static_cast(xd); - y[m] = static_cast(yd); - m++; - } - } - /* compute rle encoding given y-boundary points */ - k = m; - auto aptr = memory::Alloc(cpu, sizeof(uint) * (k + 1)); - a = reinterpret_cast(aptr->ptr()); - for (j = 0; j < k; j++) a[j] = static_cast(x[j] * h + y[j]); - a[k++] = static_cast(h * w); - - qsort(a, k, sizeof(uint), Compare); - uint p = 0; - for (j = 0; j < k; j++) { - uint t = a[j]; - a[j] -= p; - p = t; - } - auto bptr = memory::Alloc(cpu, sizeof(uint32_t) * k); - b = reinterpret_cast(bptr->ptr()); - j = m = 0; - b[m++] = a[j++]; - while (j < k) { - if (a[j] > 0) { - b[m++] = a[j++]; - } else { - j++; - if (j < k) b[m - 1] += a[j++]; - } - } - - // convert to mask - auto mskptr = memory::Alloc(cpu, sizeof(uint8_t) * h * w); - uint8_t* msk = reinterpret_cast(mskptr->ptr()); - Decode(b, m, msk); - - for (int ii = 0; ii < h; ++ii) { - for (int jj = 0; jj < w; ++jj) { - mask[ii * w + jj] = msk[jj * h + ii]; - } - } -} - -void Poly2Boxes(const std::vector>>& polys, - float* boxes) { - // lists - for (size_t i = 0; i < polys.size(); ++i) { - float x0 = std::numeric_limits::max(); - float x1 = std::numeric_limits::min(); - float y0 = std::numeric_limits::max(); - float y1 = std::numeric_limits::min(); - // each list may have more than one polys - for (const auto& item : polys[i]) { - for (size_t k = 0; k < item.size() / 2; ++k) { - x0 = std::min(x0, item[2 * k]); - x1 = std::max(x1, item[2 * k]); - y0 = std::min(y0, item[2 * k + 1]); - y1 = std::max(y1, item[2 * k + 1]); - } - } - boxes[i * 4] = x0; - boxes[i * 4 + 1] = y0; - boxes[i * 4 + 2] = x1; - boxes[i * 4 + 3] = y1; - } -} - -void Polys2MaskWrtBox(const std::vector>& polygons, - const float* box, - int M, - uint8_t* mask) { - float w = box[2] - box[0]; - float h = box[3] - box[1]; - w = std::max(w, static_cast(1.)); - h = std::max(h, static_cast(1.)); - - // short-circuit for case "polygons.size() == 1" - if (polygons.size() == 1UL) { - int k = static_cast(polygons[0].size() / 2); - std::vector p; - for (int j = 0; j < k; ++j) { - float pw = (polygons[0][2 * j] - box[0]) * M / w; // NOLINT - float ph = (polygons[0][2 * j + 1] - box[1]) * M / h; // NOLINT - p.push_back(pw); - p.push_back(ph); - } - Poly2Mask(p.data(), k, M, M, mask); - - return; - } - - uint8_t* msk = reinterpret_cast( - malloc(M * M * polygons.size() * sizeof(uint8_t))); // NOLINT - - for (size_t i = 0; i < polygons.size(); ++i) { - int k = static_cast(polygons[i].size() / 2); - std::vector p; - for (int j = 0; j < k; ++j) { - float pw = (polygons[i][2 * j] - box[0]) * M / w; // NOLINT - float ph = (polygons[i][2 * j + 1] - box[1]) * M / h; // 
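// [Editor's sketch, not part of the original file] Poly2Boxes() above reduces
// each list of polygons to one tight axis-aligned box by taking the min/max
// over all x and y vertices. For a single triangle (hypothetical numbers):
#include <algorithm>
#include <cstdio>
#include <vector>
int main() {
  // One instance with one polygon, stored as (x, y) pairs.
  std::vector<float> poly = {1.f, 2.f, 5.f, 2.f, 3.f, 7.f};
  float x0 = poly[0], y0 = poly[1], x1 = poly[0], y1 = poly[1];
  for (size_t k = 0; k < poly.size() / 2; ++k) {
    x0 = std::min(x0, poly[2 * k]);     x1 = std::max(x1, poly[2 * k]);
    y0 = std::min(y0, poly[2 * k + 1]); y1 = std::max(y1, poly[2 * k + 1]);
  }
  std::printf("[%g, %g, %g, %g]\n", x0, y0, x1, y1);  // [1, 2, 5, 7]
  return 0;
}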
NOLINT - p.push_back(pw); - p.push_back(ph); - } - uint8_t* msk_i = msk + i * M * M; - Poly2Mask(p.data(), k, M, M, msk_i); - } - - for (size_t i = 0; i < polygons.size(); ++i) { - uint8_t* msk_i = msk + i * M * M; - for (int j = 0; j < M * M; ++j) { - if (i == 0) { - mask[j] = msk_i[j]; - } else { - mask[j] = (mask[j] + msk_i[j]) > 0 ? 1 : 0; - } - } - } - free(msk); // NOLINT -} - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/detection/mask_util.h b/paddle/fluid/operators/detection/mask_util.h deleted file mode 100644 index 587a9c53794de..0000000000000 --- a/paddle/fluid/operators/detection/mask_util.h +++ /dev/null @@ -1,35 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include - -#include - -#include "paddle/utils/test_macros.h" - -namespace paddle { -namespace operators { - -TEST_API void Poly2Mask(const float* ploy, int k, int h, int w, uint8_t* mask); - -TEST_API void Poly2Boxes( - const std::vector>>& polys, float* boxes); - -TEST_API void Polys2MaskWrtBox(const std::vector>& polygons, - const float* box, - int M, - uint8_t* mask); -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/detection/retinanet_detection_output_op.cc b/paddle/fluid/operators/detection/retinanet_detection_output_op.cc deleted file mode 100644 index f43c7ec644a76..0000000000000 --- a/paddle/fluid/operators/detection/retinanet_detection_output_op.cc +++ /dev/null @@ -1,676 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -limitations under the License. 
*/ - -#include - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -class RetinanetDetectionOutputOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_GE( - ctx->Inputs("BBoxes").size(), - 1UL, - phi::errors::InvalidArgument("The length of Input(BBoxes) should " - "be greater than 0, but received " - "BBoxes length is:%d.", - ctx->Inputs("BBoxes").size())); - PADDLE_ENFORCE_GE( - ctx->Inputs("Scores").size(), - 1UL, - phi::errors::InvalidArgument("The length of Input(Scores) should " - "be greater than 0, but received " - "Scores length is:%d.", - ctx->Inputs("Scores").size())); - PADDLE_ENFORCE_GE( - ctx->Inputs("Anchors").size(), - 1UL, - phi::errors::InvalidArgument("The length of Input(Anchors) should " - "be greater than 0, but received " - "Anchors length is:%d.", - ctx->Inputs("Anchors").size())); - PADDLE_ENFORCE_EQ( - ctx->Inputs("BBoxes").size(), - ctx->Inputs("Scores").size(), - phi::errors::InvalidArgument( - "Input(BBoxes) and Input(Scores) should have the same length, but " - "received BBoxes length is:%d, Scores length is:%d.", - ctx->Inputs("BBoxes").size(), - ctx->Inputs("Scores").size())); - PADDLE_ENFORCE_EQ( - ctx->Inputs("BBoxes").size(), - ctx->Inputs("Anchors").size(), - phi::errors::InvalidArgument( - "Input(BBoxes) and Input(Anchors) should have the same length, but " - "received BBoxes length is:%d, Anchors length is:%d.", - ctx->Inputs("BBoxes").size(), - ctx->Inputs("Anchors").size())); - OP_INOUT_CHECK(ctx->HasInput("ImInfo"), - "Input", - "ImInfo", - "retinanet_detection_output"); - OP_INOUT_CHECK( - ctx->HasOutput("Out"), "Output", "Out", "retinanet_detection_output"); - - auto bboxes_dims = ctx->GetInputsDim("BBoxes"); - auto scores_dims = ctx->GetInputsDim("Scores"); - auto anchors_dims = ctx->GetInputsDim("Anchors"); - auto im_info_dims = ctx->GetInputDim("ImInfo"); - - const size_t b_n = bboxes_dims.size(); - PADDLE_ENFORCE_GT( - b_n, - 0, - phi::errors::InvalidArgument("The number of Variables in Input(BBoxes) " - "should be greater than 0, " - "but received number is:%d.", - b_n)); - const size_t s_n = scores_dims.size(); - PADDLE_ENFORCE_GT( - s_n, - 0, - phi::errors::InvalidArgument("The number of Variables in Input(Scores) " - "should be greater than 0, " - "but received number is:%d.", - s_n)); - const size_t a_n = anchors_dims.size(); - PADDLE_ENFORCE_GT(a_n, - 0, - phi::errors::InvalidArgument( - "The number of Variables in Input(Anchors) " - "should be greater than 0, " - "but received number is:%d.", - a_n)); - auto bbox_dims = bboxes_dims[0]; - auto score_dims = scores_dims[0]; - auto anchor_dims = anchors_dims[0]; - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ( - score_dims.size(), - 3, - phi::errors::InvalidArgument( - "The rank of each Variable in Input(Scores) must be 3, " - "but received rank is:%d.", - score_dims.size())); - PADDLE_ENFORCE_EQ( - bbox_dims.size(), - 3, - phi::errors::InvalidArgument( - "The rank of each Variable in Input(BBoxes) must be 3, " - "but received rank is:%d.", - bbox_dims.size())); - PADDLE_ENFORCE_EQ( - anchor_dims.size(), - 2, - phi::errors::InvalidArgument( - "The rank of each Variable in Input(Anchors) must be 2, " - "but received rank is:%d.", - anchor_dims.size())); - PADDLE_ENFORCE_EQ( - bbox_dims[2], - 4, - phi::errors::InvalidArgument( - "The last dimension of each Variable in Input(BBoxes) must be 4 " - 
"representing the layout of coordinate [xmin, ymin, xmax, ymax], " - "but received dimension is:%d.", - bbox_dims[2])); - PADDLE_ENFORCE_EQ(bbox_dims[1], - score_dims[1], - phi::errors::InvalidArgument( - "The 2nd dimension of Variables in Input(BBoxes) " - "and Input(Scores) " - "must be same, which represents the number of the " - "predicted boxes, " - "but received BBoxes 2nd dimension is:%d, Scores " - "2nd dimension is:%d.", - bbox_dims[1], - score_dims[1])); - PADDLE_ENFORCE_EQ( - anchor_dims[0], - bbox_dims[1], - phi::errors::InvalidArgument( - "The 1st dimension of each Variables in Input(Anchors) must be " - "equal " - "to the 2nd dimension of corresponding Variables in " - "Input(BBoxes), " - "which represents the number of the predicted boxes, but " - "received " - "Anchors 1st dimension is:%d, BBoxes 2nd dimension is:%d.", - anchor_dims[0], - bbox_dims[1])); - PADDLE_ENFORCE_EQ(im_info_dims.size(), - 2, - phi::errors::InvalidArgument( - "The rank of Input(ImInfo) must be 2, but " - "received ImInfo rank is:%d.", - im_info_dims.size())); - } - // Here the box_dims[0] is not the real dimension of output. - // It will be rewritten in the computing kernel. - ctx->SetOutputDim("Out", {bbox_dims[1], bbox_dims[2] + 2}); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - auto input_data_type = - OperatorWithKernel::IndicateVarDataType(ctx, "Scores"); - return phi::KernelKey(input_data_type, - platform::CPUPlace()); // ctx.GetPlace()); - } -}; - -template -bool SortScorePairDescend(const std::pair& pair1, - const std::pair& pair2) { - return pair1.first > pair2.first; -} - -template -bool SortScoreTwoPairDescend(const std::pair>& pair1, - const std::pair>& pair2) { - return pair1.first > pair2.first; -} - -template -static inline void GetMaxScoreIndex( - const std::vector& scores, - const T threshold, - int top_k, - std::vector>* sorted_indices) { - for (size_t i = 0; i < scores.size(); ++i) { - if (scores[i] > threshold) { - sorted_indices->push_back(std::make_pair(scores[i], i)); - } - } - // Sort the score pair according to the scores in descending order - std::stable_sort(sorted_indices->begin(), - sorted_indices->end(), - SortScorePairDescend); - // Keep top_k scores if needed. - if (top_k > -1 && top_k < static_cast(sorted_indices->size())) { - sorted_indices->resize(top_k); - } -} - -template -static inline T BBoxArea(const std::vector& box, const bool normalized) { - if (box[2] < box[0] || box[3] < box[1]) { - // If coordinate values are is invalid - // (e.g. xmax < xmin or ymax < ymin), return 0. - return static_cast(0.); - } else { - const T w = box[2] - box[0]; - const T h = box[3] - box[1]; - if (normalized) { - return w * h; - } else { - // If coordinate values are not within range [0, 1]. - return (w + 1) * (h + 1); - } - } -} - -template -static inline T JaccardOverlap(const std::vector& box1, - const std::vector& box2, - const bool normalized) { - if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] || - box2[3] < box1[1]) { - return static_cast(0.); - } else { - const T inter_xmin = std::max(box1[0], box2[0]); - const T inter_ymin = std::max(box1[1], box2[1]); - const T inter_xmax = std::min(box1[2], box2[2]); - const T inter_ymax = std::min(box1[3], box2[3]); - T norm = normalized ? static_cast(0.) 
: static_cast(1.); - T inter_w = inter_xmax - inter_xmin + norm; - T inter_h = inter_ymax - inter_ymin + norm; - const T inter_area = inter_w * inter_h; - const T bbox1_area = BBoxArea(box1, normalized); - const T bbox2_area = BBoxArea(box2, normalized); - return inter_area / (bbox1_area + bbox2_area - inter_area); - } -} - -template -class RetinanetDetectionOutputKernel : public framework::OpKernel { - public: - void NMSFast(const std::vector>& cls_dets, - const T nms_threshold, - const T eta, - std::vector* selected_indices) const { - int64_t num_boxes = cls_dets.size(); - std::vector> sorted_indices; - for (int64_t i = 0; i < num_boxes; ++i) { - sorted_indices.push_back(std::make_pair(cls_dets[i][4], i)); - } - // Sort the score pair according to the scores in descending order - std::stable_sort(sorted_indices.begin(), - sorted_indices.end(), - SortScorePairDescend); - selected_indices->clear(); - T adaptive_threshold = nms_threshold; - - while (!sorted_indices.empty()) { - const int idx = sorted_indices.front().second; - bool keep = true; - for (const auto kept_idx : *selected_indices) { - if (keep) { - T overlap = T(0.); - overlap = JaccardOverlap(cls_dets[idx], cls_dets[kept_idx], false); - keep = overlap <= adaptive_threshold; - } else { - break; - } - } - if (keep) { - selected_indices->push_back(idx); - } - sorted_indices.erase(sorted_indices.begin()); - if (keep && eta < 1 && adaptive_threshold > 0.5) { - adaptive_threshold *= eta; - } - } - } - - void DeltaScoreToPrediction( - const std::vector& bboxes_data, - const std::vector& anchors_data, - T im_height, - T im_width, - T im_scale, - int class_num, - const std::vector>& sorted_indices, - std::map>>* preds) const { - im_height = static_cast(round(im_height / im_scale)); - im_width = static_cast(round(im_width / im_scale)); - T zero(0); - int i = 0; - for (const auto& it : sorted_indices) { - T score = it.first; - int idx = it.second; - int a = idx / class_num; - int c = idx % class_num; - - int box_offset = a * 4; - T anchor_box_width = - anchors_data[box_offset + 2] - anchors_data[box_offset] + 1; - T anchor_box_height = - anchors_data[box_offset + 3] - anchors_data[box_offset + 1] + 1; - T anchor_box_center_x = anchors_data[box_offset] + anchor_box_width / 2; - T anchor_box_center_y = - anchors_data[box_offset + 1] + anchor_box_height / 2; - T target_box_center_x = 0, target_box_center_y = 0; - T target_box_width = 0, target_box_height = 0; - target_box_center_x = - bboxes_data[box_offset] * anchor_box_width + anchor_box_center_x; - target_box_center_y = - bboxes_data[box_offset + 1] * anchor_box_height + anchor_box_center_y; - target_box_width = - std::exp(bboxes_data[box_offset + 2]) * anchor_box_width; - target_box_height = - std::exp(bboxes_data[box_offset + 3]) * anchor_box_height; - T pred_box_xmin = target_box_center_x - target_box_width / 2; - T pred_box_ymin = target_box_center_y - target_box_height / 2; - T pred_box_xmax = target_box_center_x + target_box_width / 2 - 1; - T pred_box_ymax = target_box_center_y + target_box_height / 2 - 1; - pred_box_xmin = pred_box_xmin / im_scale; - pred_box_ymin = pred_box_ymin / im_scale; - pred_box_xmax = pred_box_xmax / im_scale; - pred_box_ymax = pred_box_ymax / im_scale; - - pred_box_xmin = std::max(std::min(pred_box_xmin, im_width - 1), zero); - pred_box_ymin = std::max(std::min(pred_box_ymin, im_height - 1), zero); - pred_box_xmax = std::max(std::min(pred_box_xmax, im_width - 1), zero); - pred_box_ymax = std::max(std::min(pred_box_ymax, im_height - 1), zero); - - 
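// [Editor's sketch, not part of the original op] DeltaScoreToPrediction()
// decodes a (tx, ty, tw, th) delta against its anchor: the center moves by
// tx * anchor_w (ty * anchor_h), the size scales by exp(tw) (exp(th)), and
// the corners are then divided by im_scale and clipped to the image, as the
// statements above show. The core arithmetic in isolation (DecodeBox is a
// hypothetical helper):
#include <cmath>
static void DecodeBox(const float anchor[4],  // xmin, ymin, xmax, ymax
                      const float delta[4],   // tx, ty, tw, th
                      float out[4]) {         // decoded xmin, ymin, xmax, ymax
  const float aw = anchor[2] - anchor[0] + 1.0f;  // +1 pixel convention
  const float ah = anchor[3] - anchor[1] + 1.0f;
  const float acx = anchor[0] + aw / 2.0f;
  const float acy = anchor[1] + ah / 2.0f;
  const float cx = delta[0] * aw + acx;
  const float cy = delta[1] * ah + acy;
  const float w = std::exp(delta[2]) * aw;
  const float h = std::exp(delta[3]) * ah;
  out[0] = cx - w / 2.0f;
  out[1] = cy - h / 2.0f;
  out[2] = cx + w / 2.0f - 1.0f;
  out[3] = cy + h / 2.0f - 1.0f;
}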
std::vector one_pred; - one_pred.push_back(pred_box_xmin); - one_pred.push_back(pred_box_ymin); - one_pred.push_back(pred_box_xmax); - one_pred.push_back(pred_box_ymax); - one_pred.push_back(score); - (*preds)[c].push_back(one_pred); - i++; - } - } - - void MultiClassNMS(const std::map>>& preds, - int class_num, - const int keep_top_k, - const T nms_threshold, - const T nms_eta, - std::vector>* nmsed_out, - int* num_nmsed_out) const { - std::map> indices; - int num_det = 0; - for (int c = 0; c < class_num; ++c) { - if (static_cast(preds.count(c))) { - const std::vector> cls_dets = preds.at(c); - NMSFast(cls_dets, nms_threshold, nms_eta, &(indices[c])); - num_det += static_cast(indices[c].size()); - } - } - - std::vector>> score_index_pairs; - for (const auto& it : indices) { - int label = it.first; - const std::vector& label_indices = it.second; - for (auto idx : label_indices) { - score_index_pairs.push_back(std::make_pair(preds.at(label)[idx][4], - std::make_pair(label, idx))); - } - } - // Keep top k results per image. - std::stable_sort(score_index_pairs.begin(), - score_index_pairs.end(), - SortScoreTwoPairDescend); - if (num_det > keep_top_k) { - score_index_pairs.resize(keep_top_k); - } - - // Store the new indices. - std::map> new_indices; - for (const auto& it : score_index_pairs) { - int label = it.second.first; - int idx = it.second.second; - std::vector one_pred; - one_pred.push_back(label); - one_pred.push_back(preds.at(label)[idx][4]); - one_pred.push_back(preds.at(label)[idx][0]); - one_pred.push_back(preds.at(label)[idx][1]); - one_pred.push_back(preds.at(label)[idx][2]); - one_pred.push_back(preds.at(label)[idx][3]); - nmsed_out->push_back(one_pred); - } - - *num_nmsed_out = (num_det > keep_top_k ? keep_top_k : num_det); - } - - void RetinanetDetectionOutput(const framework::ExecutionContext& ctx, - const std::vector& scores, - const std::vector& bboxes, - const std::vector& anchors, - const phi::DenseTensor& im_info, - std::vector>* nmsed_out, - int* num_nmsed_out) const { - int64_t nms_top_k = ctx.Attr("nms_top_k"); - int64_t keep_top_k = ctx.Attr("keep_top_k"); - T nms_threshold = static_cast(ctx.Attr("nms_threshold")); - T nms_eta = static_cast(ctx.Attr("nms_eta")); - T score_threshold = static_cast(ctx.Attr("score_threshold")); - - int64_t class_num = scores[0].dims()[1]; - std::map>> preds; - for (size_t l = 0; l < scores.size(); ++l) { - // Fetch per level score - phi::DenseTensor scores_per_level = scores[l]; - // Fetch per level bbox - phi::DenseTensor bboxes_per_level = bboxes[l]; - // Fetch per level anchor - phi::DenseTensor anchors_per_level = anchors[l]; - - int64_t scores_num = scores_per_level.numel(); - int64_t bboxes_num = bboxes_per_level.numel(); - std::vector scores_data(scores_num); - std::vector bboxes_data(bboxes_num); - std::vector anchors_data(bboxes_num); - std::copy_n(scores_per_level.data(), scores_num, scores_data.begin()); - std::copy_n(bboxes_per_level.data(), bboxes_num, bboxes_data.begin()); - std::copy_n( - anchors_per_level.data(), bboxes_num, anchors_data.begin()); - std::vector> sorted_indices; - - // For the highest level, we take the threshold 0.0 - T threshold = (l < (scores.size() - 1) ? 
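// [Editor's sketch, not part of the original op] NMSFast() above is greedy
// NMS with an adaptive threshold: candidates are visited in descending score
// order, a box is kept only if its IoU with every already-kept box stays at
// or below the current threshold, and when nms_eta < 1 the threshold decays
// (threshold *= eta) once it exceeds 0.5; MultiClassNMS() then merges the
// per-class survivors and keeps the keep_top_k best overall. Skeleton of the
// greedy loop (GreedyNms and iou are illustrative, not Paddle APIs):
#include <vector>
static std::vector<int> GreedyNms(int num_boxes, float nms_threshold,
                                  float eta, float (*iou)(int, int)) {
  // Assume indices 0..num_boxes-1 are already sorted by descending score.
  std::vector<int> kept;
  float adaptive = nms_threshold;
  for (int i = 0; i < num_boxes; ++i) {
    bool keep = true;
    for (int j : kept) {
      if (iou(i, j) > adaptive) { keep = false; break; }
    }
    if (keep) {
      kept.push_back(i);
      if (eta < 1.0f && adaptive > 0.5f) adaptive *= eta;
    }
  }
  return kept;
}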
score_threshold : 0.0);
-      GetMaxScoreIndex(scores_data, threshold, nms_top_k, &sorted_indices);
-      auto* im_info_data = im_info.data<T>();
-      auto im_height = im_info_data[0];
-      auto im_width = im_info_data[1];
-      auto im_scale = im_info_data[2];
-      DeltaScoreToPrediction(bboxes_data,
-                             anchors_data,
-                             im_height,
-                             im_width,
-                             im_scale,
-                             class_num,
-                             sorted_indices,
-                             &preds);
-    }
-
-    MultiClassNMS(preds,
-                  class_num,
-                  keep_top_k,
-                  nms_threshold,
-                  nms_eta,
-                  nmsed_out,
-                  num_nmsed_out);
-  }
-
-  void MultiClassOutput(const platform::DeviceContext& ctx,
-                        const std::vector<std::vector<T>>& nmsed_out,
-                        phi::DenseTensor* outs) const {
-    auto* odata = outs->data<T>();
-    int count = 0;
-    int64_t out_dim = 6;
-    for (size_t i = 0; i < nmsed_out.size(); ++i) {
-      odata[count * out_dim] = nmsed_out[i][0] + 1;  // label
-      odata[count * out_dim + 1] = nmsed_out[i][1];  // score
-      odata[count * out_dim + 2] = nmsed_out[i][2];  // xmin
-      odata[count * out_dim + 3] = nmsed_out[i][3];  // ymin
-      odata[count * out_dim + 4] = nmsed_out[i][4];  // xmax
-      odata[count * out_dim + 5] = nmsed_out[i][5];  // ymax
-      count++;
-    }
-  }
-
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto boxes = ctx.MultiInput<phi::DenseTensor>("BBoxes");
-    auto scores = ctx.MultiInput<phi::DenseTensor>("Scores");
-    auto anchors = ctx.MultiInput<phi::DenseTensor>("Anchors");
-    auto* im_info = ctx.Input<phi::DenseTensor>("ImInfo");
-    auto* outs = ctx.Output<phi::DenseTensor>("Out");
-
-    std::vector<phi::DenseTensor> boxes_list(boxes.size());
-    std::vector<phi::DenseTensor> scores_list(scores.size());
-    std::vector<phi::DenseTensor> anchors_list(anchors.size());
-    for (size_t j = 0; j < boxes_list.size(); ++j) {
-      boxes_list[j] = *boxes[j];
-      scores_list[j] = *scores[j];
-      anchors_list[j] = *anchors[j];
-    }
-    auto score_dims = scores_list[0].dims();
-    int64_t batch_size = score_dims[0];
-    auto box_dims = boxes_list[0].dims();
-    int64_t box_dim = box_dims[2];
-    int64_t out_dim = box_dim + 2;
-
-    auto& dev_ctx = ctx.template device_context<phi::CPUContext>();
-
-    std::vector<std::vector<std::vector<T>>> all_nmsed_out;
-    std::vector<size_t> batch_starts = {0};
-    for (int i = 0; i < batch_size; ++i) {
-      int num_nmsed_out = 0;
-      std::vector<phi::DenseTensor> box_per_batch_list(boxes_list.size());
-      std::vector<phi::DenseTensor> score_per_batch_list(scores_list.size());
-      for (size_t j = 0; j < boxes_list.size(); ++j) {
-        const auto& score_dims = scores_list[j].dims();
-        score_per_batch_list[j] = scores_list[j].Slice(i, i + 1);
-        score_per_batch_list[j].Resize({score_dims[1], score_dims[2]});
-        box_per_batch_list[j] = boxes_list[j].Slice(i, i + 1);
-        box_per_batch_list[j].Resize({score_dims[1], box_dim});
-      }
-      phi::DenseTensor im_info_slice = im_info->Slice(i, i + 1);
-
-      std::vector<std::vector<T>> nmsed_out;
-      RetinanetDetectionOutput(ctx,
-                               score_per_batch_list,
-                               box_per_batch_list,
-                               anchors_list,
-                               im_info_slice,
-                               &nmsed_out,
-                               &num_nmsed_out);
-      all_nmsed_out.push_back(nmsed_out);
-      batch_starts.push_back(batch_starts.back() + num_nmsed_out);
-    }
-
-    int num_kept = static_cast<int>(batch_starts.back());
-    if (num_kept == 0) {
-      outs->Resize({0, out_dim});
-    } else {
-      outs->mutable_data<T>({num_kept, out_dim}, ctx.GetPlace());
-      for (int i = 0; i < batch_size; ++i) {
-        int64_t s = static_cast<int64_t>(batch_starts[i]);
-        int64_t e = static_cast<int64_t>(batch_starts[i + 1]);
-        if (e > s) {
-          phi::DenseTensor out = outs->Slice(s, e);
-          MultiClassOutput(dev_ctx, all_nmsed_out[i], &out);
-        }
-      }
-    }
-
-    framework::LoD lod;
-    lod.emplace_back(batch_starts);
-
-    outs->set_lod(lod);
-  }
-};
-
-class RetinanetDetectionOutputOpMaker
-    : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("BBoxes",
-             "(List) A list of tensors from multiple FPN levels. "
Each " - "element is a 3-D phi::DenseTensor with shape [N, Mi, 4] " - "represents the " - "predicted locations of Mi bounding boxes, N is the batch size. " - "Mi is the number of bounding boxes from i-th FPN level. Each " - "bounding box has four coordinate values and the layout is " - "[xmin, ymin, xmax, ymax].") - .AsDuplicable(); - AddInput("Scores", - "(List) A list of tensors from multiple FPN levels. Each " - "element is a 3-D phi::DenseTensor with shape [N, Mi, C] " - "represents the " - "predicted confidence from its FPN level. N is the batch size, " - "C is the class number (excluding background), Mi is the number " - "of bounding boxes from i-th FPN level. For each bounding box, " - "there are total C scores.") - .AsDuplicable(); - AddInput( - "Anchors", - "(List) A list of tensors from multiple FPN levels. Each" - "element is a 2-D phi::DenseTensor with shape [Mi, 4] represents the " - "locations of Mi anchor boxes from i-th FPN level. Each " - "bounding box has four coordinate values and the layout is " - "[xmin, ymin, xmax, ymax].") - .AsDuplicable(); - AddInput("ImInfo", - "(phi::DenseTensor) A 2-D phi::DenseTensor with shape [N, 3] " - "represents the " - "image information. N is the batch size, each image information " - "includes height, width and scale."); - AddAttr("score_threshold", - "(float) " - "Threshold to filter out bounding boxes with a confidence " - "score."); - AddAttr("nms_top_k", - "(int64_t) " - "Maximum number of detections per FPN layer to be kept " - "according to the confidence before NMS."); - AddAttr("nms_threshold", - "(float) " - "The threshold to be used in NMS."); - AddAttr("nms_eta", - "(float) " - "The parameter for adaptive NMS."); - AddAttr( - "keep_top_k", - "(int64_t) " - "Number of total bounding boxes to be kept per image after NMS " - "step."); - AddOutput("Out", - "(phi::DenseTensor) A 2-D phi::DenseTensor with shape [No, 6] " - "represents the " - "detections. Each row has 6 values: " - "[label, confidence, xmin, ymin, xmax, ymax]" - "No is the total number of detections in this mini-batch." - "For each instance, " - "the offsets in first dimension are called LoD, the number of " - "offset is N + 1, if LoD[i + 1] - LoD[i] == 0, means there is " - "no detected bbox."); - AddComment(R"DOC( -This operator is to decode boxes and scores from each FPN layer and do -multi-class non maximum suppression (NMS) on merged predictions. - -Top-scoring predictions per FPN layer are decoded with the anchor -information. This operator greedily selects a subset of detection bounding -boxes from each FPN layer that have high scores larger than score_threshold, -if providing this threshold, then selects the largest nms_top_k confidences -scores per FPN layer, if nms_top_k is larger than -1. -The decoding schema is described below: - -ox = (pw * pxv * tx * + px) - tw / 2 - -oy = (ph * pyv * ty * + py) - th / 2 - -ow = exp(pwv * tw) * pw + tw / 2 - -oh = exp(phv * th) * ph + th / 2 - -where `tx`, `ty`, `tw`, `th` denote the predicted box's center coordinates, width -and height respectively. Similarly, `px`, `py`, `pw`, `ph` denote the -anchor's center coordinates, width and height. `pxv`, `pyv`, `pwv`, -`phv` denote the variance of the anchor box and `ox`, `oy`, `ow`, `oh` denote the -decoded coordinates, width and height. - -Then the top decoded prediction from all levels are merged followed by NMS. 
-In the NMS step, this operator prunes away boxes that have a high IoU
-(intersection over union) with already selected boxes, using adaptive
-threshold NMS based on the nms_threshold and nms_eta parameters.
-After the NMS step, at most keep_top_k bounding boxes in total are kept
-per image if keep_top_k is larger than -1.
-This operator supports multi-class and batched inputs. It applies NMS
-independently for each class. The output is a 2-D LoDTensor; for each
-image, the offsets in the first dimension of the phi::DenseTensor are called
-LoD, and the number of offsets is N + 1, where N is the batch size. If
-LoD[i + 1] - LoD[i] == 0, there is no detected bounding box for that image.
-If no boxes are detected for any image, all the elements in LoD are set to 0,
-and the output tensor is empty (None).
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(
-    retinanet_detection_output,
-    ops::RetinanetDetectionOutputOp,
-    ops::RetinanetDetectionOutputOpMaker,
-    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
-    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
-PD_REGISTER_STRUCT_KERNEL(retinanet_detection_output,
-                          CPU,
-                          ALL_LAYOUT,
-                          ops::RetinanetDetectionOutputKernel,
-                          float,
-                          double) {}
diff --git a/paddle/fluid/operators/detection/rpn_target_assign_op.cc b/paddle/fluid/operators/detection/rpn_target_assign_op.cc
deleted file mode 100644
index d3c315b7bdfc5..0000000000000
--- a/paddle/fluid/operators/detection/rpn_target_assign_op.cc
+++ /dev/null
@@ -1,1262 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
*/ - -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/detection/bbox_util.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -using EigenMatrix = framework::EigenMatrix; - -class RpnTargetAssignOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK( - ctx->HasInput("Anchor"), "Input", "Anchor", "rpn_target_assign"); - OP_INOUT_CHECK( - ctx->HasInput("GtBoxes"), "Input", "GtBoxes", "rpn_target_assign"); - OP_INOUT_CHECK( - ctx->HasInput("IsCrowd"), "Input", "IsCrowd", "rpn_target_assign"); - OP_INOUT_CHECK( - ctx->HasInput("ImInfo"), "Input", "ImInfo", "rpn_target_assign"); - - OP_INOUT_CHECK(ctx->HasOutput("LocationIndex"), - "Output", - "LocationIndex", - "rpn_target_assign"); - OP_INOUT_CHECK(ctx->HasOutput("ScoreIndex"), - "Output", - "ScoreIndex", - "rpn_target_assign"); - OP_INOUT_CHECK(ctx->HasOutput("TargetLabel"), - "Output", - "TargetLabel", - "rpn_target_assign"); - OP_INOUT_CHECK(ctx->HasOutput("TargetBBox"), - "Output", - "TargetBBox", - "rpn_target_assign"); - OP_INOUT_CHECK(ctx->HasOutput("BBoxInsideWeight"), - "Output", - "BBoxInsideWeight", - "rpn_target_assign"); - - auto anchor_dims = ctx->GetInputDim("Anchor"); - auto gt_boxes_dims = ctx->GetInputDim("GtBoxes"); - auto im_info_dims = ctx->GetInputDim("ImInfo"); - PADDLE_ENFORCE_EQ(anchor_dims.size(), - 2, - phi::errors::InvalidArgument( - "The dimensions size of Input(Anchor) must be 2. But " - "received dimensions size=[%d], dimensions=[%s].", - anchor_dims.size(), - anchor_dims)); - PADDLE_ENFORCE_EQ(gt_boxes_dims.size(), - 2, - phi::errors::InvalidArgument( - "The dimensions size of Input(GtBoxes) must be 2. " - "But received dimensions size=[%d], dimensions=[%s].", - gt_boxes_dims.size(), - gt_boxes_dims)); - PADDLE_ENFORCE_EQ(im_info_dims.size(), - 2, - phi::errors::InvalidArgument( - "The dimensions size of Input(ImInfo) must be 2. 
But " - "received dimensions size=[%d], dimensions=[%s].", - im_info_dims.size(), - im_info_dims)); - - ctx->SetOutputDim("LocationIndex", {-1}); - ctx->SetOutputDim("ScoreIndex", {-1}); - ctx->SetOutputDim("TargetLabel", {-1, 1}); - ctx->SetOutputDim("TargetBBox", {-1, 4}); - ctx->SetOutputDim("BBoxInsideWeight", {-1, 4}); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey( - OperatorWithKernel::IndicateVarDataType(ctx, "Anchor"), - platform::CPUPlace()); - } -}; - -template -void AppendRpns(phi::DenseTensor* out, - int64_t offset, - phi::DenseTensor* to_add) { - auto* out_data = out->data(); - auto* to_add_data = to_add->data(); - memcpy(out_data + offset, to_add_data, to_add->numel() * sizeof(T)); -} - -template -std::vector FilterStraddleAnchor( - const phi::CPUContext& context, - const phi::DenseTensor* anchor, - const float rpn_straddle_thresh, - T im_height, - T im_width) { - std::vector inds_inside; - int anchor_num = static_cast(anchor->dims()[0]); - auto* anchor_data = anchor->data(); - if (rpn_straddle_thresh >= 0) { - int index = 0; - for (int i = 0; i < anchor_num; ++i) { - index = i * 4; - if ((anchor_data[index + 0] >= -rpn_straddle_thresh) && - (anchor_data[index + 1] >= -rpn_straddle_thresh) && - (anchor_data[index + 2] < im_width + rpn_straddle_thresh) && - (anchor_data[index + 3] < im_height + rpn_straddle_thresh)) { - inds_inside.emplace_back(i); - } - } - } else { - for (int i = 0; i < anchor_num; ++i) { - inds_inside.emplace_back(i); - } - } - int inside_num = static_cast(inds_inside.size()); - phi::DenseTensor inds_inside_t; - int* inds_inside_data = - inds_inside_t.mutable_data({inside_num}, context.GetPlace()); - std::copy(inds_inside.begin(), inds_inside.end(), inds_inside_data); - phi::DenseTensor inside_anchor_t; - T* inside_anchor_data = - inside_anchor_t.mutable_data({inside_num, 4}, context.GetPlace()); - Gather( - anchor->data(), 4, inds_inside_data, inside_num, inside_anchor_data); - std::vector res; - res.emplace_back(inds_inside_t); - res.emplace_back(inside_anchor_t); - return res; -} - -template -phi::DenseTensor FilterCrowdGt(const phi::CPUContext& context, - phi::DenseTensor* gt_boxes, - phi::DenseTensor* is_crowd) { - int gt_num = static_cast(gt_boxes->dims()[0]); - std::vector not_crowd_inds; - auto* is_crowd_data = is_crowd->data(); - for (int i = 0; i < gt_num; ++i) { - if (is_crowd_data[i] == 0) { - not_crowd_inds.emplace_back(i); - } - } - int ncrowd_num = static_cast(not_crowd_inds.size()); - phi::DenseTensor ncrowd_gt_boxes; - T* ncrowd_gt_boxes_data = - ncrowd_gt_boxes.mutable_data({ncrowd_num, 4}, context.GetPlace()); - Gather(gt_boxes->data(), - 4, - not_crowd_inds.data(), - ncrowd_num, - ncrowd_gt_boxes_data); - return ncrowd_gt_boxes; -} - -void ReservoirSampling(const int num, - std::vector* inds, - std::minstd_rand engine, - bool use_random) { - std::uniform_real_distribution uniform(0, 1); - int len = static_cast(inds->size()); - if (len > num) { - if (use_random) { - for (int i = num; i < len; ++i) { - int rng_ind = std::floor(uniform(engine) * i); // NOLINT - if (rng_ind < num) - std::iter_swap(inds->begin() + rng_ind, inds->begin() + i); - } - } - inds->resize(num); - } -} - -template -void ScoreAssign(const T* anchor_by_gt_overlap_data, - const phi::DenseTensor& anchor_to_gt_max, - const phi::DenseTensor& gt_to_anchor_max, - const int rpn_batch_size_per_im, - const float rpn_fg_fraction, - const float rpn_positive_overlap, - const float 
rpn_negative_overlap, - std::vector* fg_inds, - std::vector* bg_inds, - std::vector* tgt_lbl, - std::vector* fg_fake, - std::vector* bbox_inside_weight, - std::minstd_rand engine, - bool use_random) { - float epsilon = 0.00001; - int anchor_num = static_cast(anchor_to_gt_max.dims()[0]); - int gt_num = static_cast(gt_to_anchor_max.dims()[0]); - std::vector target_label(anchor_num, -1); - std::vector fg_inds_fake; - std::vector bg_inds_fake; - const T* anchor_to_gt_max_data = anchor_to_gt_max.data(); - const T* gt_to_anchor_max_data = gt_to_anchor_max.data(); - // TODO(buxingyuan): Match with Detectron now - // but it seems here is a bug in two directions assignment - // in which the later one may overwrites the former one. - for (int64_t i = 0; i < anchor_num; ++i) { - bool is_anchors_with_max_overlap = false; - for (int64_t j = 0; j < gt_num; ++j) { - T value = anchor_by_gt_overlap_data[i * gt_num + j]; - T diff = std::abs(value - gt_to_anchor_max_data[j]); - if (diff < epsilon) { - is_anchors_with_max_overlap = true; - break; - } - } - bool is_anchor_great_than_thresh = - (anchor_to_gt_max_data[i] >= rpn_positive_overlap); - if (is_anchors_with_max_overlap || is_anchor_great_than_thresh) { - fg_inds_fake.push_back(i); // NOLINT - } - } - - // Reservoir Sampling - int fg_num = 0; - if (rpn_fg_fraction > 0 && rpn_batch_size_per_im > 0) { - fg_num = - static_cast(rpn_fg_fraction * rpn_batch_size_per_im); // NOLINT - ReservoirSampling(fg_num, &fg_inds_fake, engine, use_random); - } else { - fg_num = static_cast(fg_inds_fake.size()); - } - int fg_fake_num = static_cast(fg_inds_fake.size()); - for (int64_t i = 0; i < fg_fake_num; ++i) { - target_label[fg_inds_fake[i]] = 1; - } - - for (int64_t i = 0; i < anchor_num; ++i) { - if (anchor_to_gt_max_data[i] < rpn_negative_overlap) { - bg_inds_fake.push_back(i); // NOLINT - } - } - int bg_num = 0; - if (rpn_fg_fraction > 0 && rpn_batch_size_per_im > 0) { - bg_num = rpn_batch_size_per_im - fg_fake_num; - ReservoirSampling(bg_num, &bg_inds_fake, engine, use_random); - bg_num = static_cast(bg_inds_fake.size()); - } else { - bg_num = static_cast(bg_inds_fake.size()); - } - - int fake_num = 0; - for (int64_t i = 0; i < bg_num; ++i) { - // fg fake found - if (target_label[bg_inds_fake[i]] == 1) { - fake_num++; - fg_fake->emplace_back(fg_inds_fake[0]); - for (int j = 0; j < 4; ++j) { - bbox_inside_weight->emplace_back(T(0.)); - } - } - target_label[bg_inds_fake[i]] = 0; - } - - for (int64_t i = 0; i < (fg_fake_num - fake_num) * 4; ++i) { - bbox_inside_weight->emplace_back(T(1.)); - } - - for (int64_t i = 0; i < anchor_num; ++i) { - if (target_label[i] == 1) { - fg_inds->emplace_back(i); - fg_fake->emplace_back(i); - } - if (target_label[i] == 0) bg_inds->emplace_back(i); - } - fg_num = static_cast(fg_inds->size()); - bg_num = static_cast(bg_inds->size()); - - tgt_lbl->resize(fg_num + bg_num, 0); - std::vector fg_lbl(fg_num, 1); - std::vector bg_lbl(bg_num, 0); - std::copy(fg_lbl.begin(), fg_lbl.end(), tgt_lbl->data()); - std::copy(bg_lbl.begin(), bg_lbl.end(), tgt_lbl->data() + fg_num); -} - -template -std::vector SampleRpnFgBgGt( - const phi::CPUContext& ctx, - const phi::DenseTensor& anchor_by_gt_overlap, - const int rpn_batch_size_per_im, - const float rpn_positive_overlap, - const float rpn_negative_overlap, - const float rpn_fg_fraction, - std::minstd_rand engine, - bool use_random) { - auto* anchor_by_gt_overlap_data = anchor_by_gt_overlap.data(); - int anchor_num = static_cast(anchor_by_gt_overlap.dims()[0]); - int gt_num = 
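// [Editor's sketch, not part of the original op] ScoreAssign() above labels
// an anchor positive if it either (a) attains the per-gt maximum overlap
// (compared within an epsilon) or (b) has a max IoU over all gt boxes of at
// least rpn_positive_overlap; it becomes a background candidate when that max
// IoU falls below rpn_negative_overlap. The predicate in isolation
// (LabelAnchor is a hypothetical helper):
static int LabelAnchor(float max_iou_over_gts, bool is_argmax_for_some_gt,
                       float pos_overlap, float neg_overlap) {
  if (is_argmax_for_some_gt || max_iou_over_gts >= pos_overlap) return 1;
  if (max_iou_over_gts < neg_overlap) return 0;
  return -1;  // neither positive nor negative: ignored by the loss
}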
static_cast(anchor_by_gt_overlap.dims()[1]); - - std::vector fg_inds; - std::vector bg_inds; - std::vector gt_inds; - std::vector tgt_lbl; - std::vector fg_fake; - std::vector bbox_inside_weight; - // Calculate the max IoU between anchors and gt boxes - // Map from anchor to gt box that has highest overlap - auto place = ctx.GetPlace(); - phi::DenseTensor anchor_to_gt_max, anchor_to_gt_argmax, gt_to_anchor_max; - anchor_to_gt_max.mutable_data({anchor_num}, place); - int* argmax = anchor_to_gt_argmax.mutable_data({anchor_num}, place); - gt_to_anchor_max.mutable_data({gt_num}, place); - - auto anchor_by_gt_overlap_et = - framework::EigenMatrix::From(anchor_by_gt_overlap); - auto anchor_to_gt_max_et = - framework::EigenVector::Flatten(anchor_to_gt_max); - auto gt_to_anchor_max_et = - framework::EigenVector::Flatten(gt_to_anchor_max); - auto anchor_to_gt_argmax_et = - framework::EigenVector::Flatten(anchor_to_gt_argmax); - anchor_to_gt_max_et = - anchor_by_gt_overlap_et.maximum(Eigen::DSizes(1)); - anchor_to_gt_argmax_et = - anchor_by_gt_overlap_et.argmax(1).template cast(); - gt_to_anchor_max_et = - anchor_by_gt_overlap_et.maximum(Eigen::DSizes(0)); - - // Follow the Faster RCNN's implementation - ScoreAssign(anchor_by_gt_overlap_data, - anchor_to_gt_max, - gt_to_anchor_max, - rpn_batch_size_per_im, - rpn_fg_fraction, - rpn_positive_overlap, - rpn_negative_overlap, - &fg_inds, - &bg_inds, - &tgt_lbl, - &fg_fake, - &bbox_inside_weight, - engine, - use_random); - - int fg_num = static_cast(fg_inds.size()); - int bg_num = static_cast(bg_inds.size()); - int fg_fake_num = static_cast(fg_fake.size()); - gt_inds.reserve(fg_fake_num); - for (int i = 0; i < fg_fake_num; ++i) { - gt_inds.emplace_back(argmax[fg_fake[i]]); - } - phi::DenseTensor loc_index_t, score_index_t, tgt_lbl_t, gt_inds_t, - bbox_inside_weight_t; - int* loc_index_data = loc_index_t.mutable_data({fg_fake_num}, place); - int* score_index_data = - score_index_t.mutable_data({fg_num + bg_num}, place); - int* tgt_lbl_data = tgt_lbl_t.mutable_data({fg_num + bg_num}, place); - int* gt_inds_data = gt_inds_t.mutable_data({fg_fake_num}, place); - T* bbox_inside_weight_data = - bbox_inside_weight_t.mutable_data({fg_fake_num, 4}, place); - std::copy(fg_fake.begin(), fg_fake.end(), loc_index_data); - std::copy(fg_inds.begin(), fg_inds.end(), score_index_data); - std::copy(bg_inds.begin(), bg_inds.end(), score_index_data + fg_num); - std::copy(tgt_lbl.begin(), tgt_lbl.end(), tgt_lbl_data); - std::copy(gt_inds.begin(), gt_inds.end(), gt_inds_data); - std::copy(bbox_inside_weight.begin(), - bbox_inside_weight.end(), - bbox_inside_weight_data); - std::vector loc_score_tgtlbl_gt; - loc_score_tgtlbl_gt.emplace_back(loc_index_t); - loc_score_tgtlbl_gt.emplace_back(score_index_t); - loc_score_tgtlbl_gt.emplace_back(tgt_lbl_t); - loc_score_tgtlbl_gt.emplace_back(gt_inds_t); - loc_score_tgtlbl_gt.emplace_back(bbox_inside_weight_t); - - return loc_score_tgtlbl_gt; -} - -template -class RpnTargetAssignKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* anchor = context.Input("Anchor"); // (H*W*A) * 4 - auto* gt_boxes = context.Input("GtBoxes"); - auto* is_crowd = context.Input("IsCrowd"); - auto* im_info = context.Input("ImInfo"); - - auto* loc_index = context.Output("LocationIndex"); - auto* score_index = context.Output("ScoreIndex"); - auto* tgt_bbox = context.Output("TargetBBox"); - auto* tgt_lbl = context.Output("TargetLabel"); - auto* bbox_inside_weight = - 
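// [Editor's sketch, not part of the original op] SampleRpnFgBgGt() above uses
// Eigen reductions over the [anchor_num, gt_num] overlap matrix: a per-anchor
// max and argmax (which gt each anchor matches best) and a per-gt max (each
// gt's best anchor overlap). Plain-loop equivalent, assuming non-negative
// overlaps (OverlapReductions is a hypothetical helper):
#include <vector>
static void OverlapReductions(const std::vector<float>& m,  // anchor-major
                              int anchor_num, int gt_num,
                              std::vector<float>* a2g_max,
                              std::vector<int>* a2g_argmax,
                              std::vector<float>* g2a_max) {
  a2g_max->assign(anchor_num, 0.0f);
  a2g_argmax->assign(anchor_num, 0);
  g2a_max->assign(gt_num, 0.0f);
  for (int i = 0; i < anchor_num; ++i) {
    for (int j = 0; j < gt_num; ++j) {
      const float v = m[i * gt_num + j];
      if (v > (*a2g_max)[i]) { (*a2g_max)[i] = v; (*a2g_argmax)[i] = j; }
      if (v > (*g2a_max)[j]) (*g2a_max)[j] = v;
    }
  }
}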
context.Output("BBoxInsideWeight"); - - PADDLE_ENFORCE_EQ(gt_boxes->lod().size(), - 1UL, - phi::errors::InvalidArgument( - "RpnTargetAssignOp gt_boxes needs 1 level of LoD. " - "But received level of LoD is [%d], LoD is [%s].", - gt_boxes->lod().size(), - gt_boxes->lod())); - PADDLE_ENFORCE_EQ(is_crowd->lod().size(), - 1UL, - phi::errors::InvalidArgument( - "RpnTargetAssignOp is_crowd needs 1 level of LoD. " - "But received level of LoD is [%d], LoD is [%s].", - is_crowd->lod().size(), - is_crowd->lod())); - int64_t anchor_num = static_cast(anchor->dims()[0]); - int64_t batch_num = static_cast(gt_boxes->lod().back().size() - 1); - - int rpn_batch_size_per_im = context.Attr("rpn_batch_size_per_im"); - float rpn_straddle_thresh = context.Attr("rpn_straddle_thresh"); - float rpn_positive_overlap = context.Attr("rpn_positive_overlap"); - float rpn_negative_overlap = context.Attr("rpn_negative_overlap"); - float rpn_fg_fraction = context.Attr("rpn_fg_fraction"); - bool use_random = context.Attr("use_random"); - - int64_t max_num = batch_num * rpn_batch_size_per_im; - auto place = context.GetPlace(); - - loc_index->mutable_data({max_num}, place); - score_index->mutable_data({max_num}, place); - tgt_bbox->mutable_data({max_num, 4}, place); - tgt_lbl->mutable_data({max_num, 1}, place); - bbox_inside_weight->mutable_data({max_num, 4}, place); - auto& dev_ctx = context.device_context(); - - std::random_device rnd; - std::minstd_rand engine; - int seed = static_cast(rnd()); - engine.seed(seed); - - framework::LoD lod_loc, loc_score; - std::vector lod0_loc(1, 0); - std::vector lod0_score(1, 0); - - int total_loc_num = 0; - int total_score_num = 0; - auto gt_boxes_lod = gt_boxes->lod().back(); - auto is_crowd_lod = is_crowd->lod().back(); - for (int i = 0; i < batch_num; ++i) { - phi::DenseTensor gt_boxes_slice = - gt_boxes->Slice(static_cast(gt_boxes_lod[i]), - static_cast(gt_boxes_lod[i + 1])); - phi::DenseTensor is_crowd_slice = - is_crowd->Slice(static_cast(is_crowd_lod[i]), - static_cast(is_crowd_lod[i + 1])); - phi::DenseTensor im_info_slice = im_info->Slice(i, i + 1); - auto* im_info_data = im_info_slice.data(); - auto im_height = im_info_data[0]; - auto im_width = im_info_data[1]; - auto im_scale = im_info_data[2]; - - // Filter straddle anchor - std::vector filter_output = FilterStraddleAnchor( - dev_ctx, anchor, rpn_straddle_thresh, im_height, im_width); - phi::DenseTensor inds_inside = filter_output[0]; - phi::DenseTensor inside_anchor = filter_output[1]; - - // Filter crowd gt - phi::DenseTensor ncrowd_gt_boxes = - FilterCrowdGt(dev_ctx, >_boxes_slice, &is_crowd_slice); - auto ncrowd_gt_boxes_et = - framework::EigenTensor::From(ncrowd_gt_boxes); - ncrowd_gt_boxes_et = ncrowd_gt_boxes_et * im_scale; - - phi::DenseTensor anchor_by_gt_overlap; - anchor_by_gt_overlap.mutable_data( - {inside_anchor.dims()[0], ncrowd_gt_boxes.dims()[0]}, place); - BboxOverlaps(inside_anchor, ncrowd_gt_boxes, &anchor_by_gt_overlap); - - auto loc_score_tgtlbl_gt = SampleRpnFgBgGt(dev_ctx, - anchor_by_gt_overlap, - rpn_batch_size_per_im, - rpn_positive_overlap, - rpn_negative_overlap, - rpn_fg_fraction, - engine, - use_random); - - phi::DenseTensor sampled_loc_index = loc_score_tgtlbl_gt[0]; - phi::DenseTensor sampled_score_index = loc_score_tgtlbl_gt[1]; - phi::DenseTensor sampled_tgtlbl = loc_score_tgtlbl_gt[2]; - phi::DenseTensor sampled_gt_index = loc_score_tgtlbl_gt[3]; - phi::DenseTensor sampled_bbox_inside_weight = loc_score_tgtlbl_gt[4]; - - int loc_num = static_cast(sampled_loc_index.dims()[0]); - int 
score_num = static_cast(sampled_score_index.dims()[0]); - // unmap to all anchor - phi::DenseTensor sampled_loc_index_unmap, sampled_score_index_unmap; - sampled_loc_index_unmap.mutable_data({loc_num}, place); - sampled_score_index_unmap.mutable_data({score_num}, place); - Gather(inds_inside.data(), - 1, - sampled_loc_index.data(), - loc_num, - sampled_loc_index_unmap.data()); - Gather(inds_inside.data(), - 1, - sampled_score_index.data(), - score_num, - sampled_score_index_unmap.data()); - - // get target bbox deltas - phi::DenseTensor sampled_anchor, sampled_gt, sampled_tgt_bbox; - auto* sampled_anchor_data = - sampled_anchor.mutable_data({loc_num, 4}, place); - auto* sampled_gt_data = sampled_gt.mutable_data({loc_num, 4}, place); - Gather(anchor->data(), - 4, - sampled_loc_index_unmap.data(), - loc_num, - sampled_anchor_data); - Gather(ncrowd_gt_boxes.data(), - 4, - sampled_gt_index.data(), - loc_num, - sampled_gt_data); - sampled_tgt_bbox.mutable_data({loc_num, 4}, place); - BoxToDelta(loc_num, - sampled_anchor, - sampled_gt, - nullptr, - false, - &sampled_tgt_bbox); - - // Add anchor offset - int anchor_offset = static_cast(i * anchor_num); - auto sampled_loc_index_unmap_et = - framework::EigenTensor::From(sampled_loc_index_unmap); - sampled_loc_index_unmap_et = sampled_loc_index_unmap_et + anchor_offset; - auto sampled_score_index_unmap_et = - framework::EigenTensor::From(sampled_score_index_unmap); - sampled_score_index_unmap_et = - sampled_score_index_unmap_et + anchor_offset; - AppendRpns(loc_index, total_loc_num, &sampled_loc_index_unmap); - AppendRpns(score_index, total_score_num, &sampled_score_index_unmap); - AppendRpns(tgt_bbox, total_loc_num * 4, &sampled_tgt_bbox); - AppendRpns(tgt_lbl, total_score_num, &sampled_tgtlbl); - AppendRpns( - bbox_inside_weight, total_loc_num * 4, &sampled_bbox_inside_weight); - total_loc_num += loc_num; - - total_score_num += score_num; - lod0_loc.emplace_back(total_loc_num); - lod0_score.emplace_back(total_score_num); - } - - PADDLE_ENFORCE_LE( - total_loc_num, - max_num, - phi::errors::InvalidArgument( - "The number of sampled bboxes should not be greater than the " - "number of all anchor boxes(%d), but the number of sampled " - "bboxes is :%d.", - max_num, - total_loc_num)); - PADDLE_ENFORCE_LE( - total_score_num, - max_num, - phi::errors::InvalidArgument( - "The number of sampled scores should not be greater than the " - "number of all anchor boxes(%d), but the number of sampled " - "scores is :%d.", - max_num, - total_score_num)); - - lod_loc.emplace_back(lod0_loc); - loc_score.emplace_back(lod0_score); - loc_index->set_lod(lod_loc); - score_index->set_lod(loc_score); - tgt_bbox->set_lod(lod_loc); - tgt_lbl->set_lod(loc_score); - bbox_inside_weight->set_lod(lod_loc); - loc_index->Resize({total_loc_num}); - score_index->Resize({total_score_num}); - tgt_bbox->Resize({total_loc_num, 4}); - tgt_lbl->Resize({total_score_num, 1}); - bbox_inside_weight->Resize({total_loc_num, 4}); - } -}; - -class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("Anchor", - "(Tensor) input anchor is a 2-D Tensor with shape [H*W*A, 4]."); - AddInput("GtBoxes", - "(phi::DenseTensor) input ground-truth bbox with shape [K, 4]."); - AddInput("IsCrowd", - "(phi::DenseTensor) input which indicates ground-truth is crowd."); - AddInput("ImInfo", - "(phi::DenseTensor) input image information with shape [N, 3]. 
" - "N is the batch size, each image information includes height, " - "width and scale."); - AddAttr("rpn_batch_size_per_im", - "Total number of RPN examples per image.") - .SetDefault(256); - AddAttr( - "rpn_straddle_thresh", - "Remove RPN anchors that go outside the image by straddle_thresh " - "pixels, " - "Set to -1 or a large value, e.g. 100000, to disable pruning anchors."); - AddAttr( - "rpn_positive_overlap", - "Minimum overlap required between an anchor and ground-truth " - "box for the (anchor, gt box) pair to be a positive example.") - .SetDefault(0.7); - AddAttr( - "rpn_negative_overlap", - "Maximum overlap allowed between an anchor and ground-truth " - "box for the (anchor, gt box) pair to be a negative examples.") - .SetDefault(0.3); - AddAttr( - "rpn_fg_fraction", - "Target fraction of RoI minibatch that " - "is labeled foreground (i.e. class > 0), 0-th class is background.") - .SetDefault(0.25); - AddAttr("use_random", - "A flag indicating whether to use a ReservoirSampling. " - "NOTE: DO NOT set this flag to false in training. " - "Setting this flag to false is only useful in unittest.") - .SetDefault(true); - AddOutput( - "LocationIndex", - "(Tensor), The indexes of foreground anchors in all RPN anchors, the " - "shape of the LocationIndex is [F], F depends on the value of input " - "tensor and attributes."); - AddOutput( - "ScoreIndex", - "(Tensor), The indexes of foreground and background anchors in all " - "RPN anchors(The rest anchors are ignored). The shape of the " - "ScoreIndex is [F + B], F and B are sampled foreground and background " - " number."); - AddOutput("TargetBBox", - "(Tensor), The target bbox deltas with shape " - "[F, 4], F is the sampled foreground number."); - AddOutput( - "TargetLabel", - "(Tensor), The target labels of each anchor with shape " - "[F + B, 1], F and B are sampled foreground and background number."); - AddOutput("BBoxInsideWeight", - "(Tensor), The bbox inside weight with shape " - "[F, 4], F is the sampled foreground number."); - AddComment(R"DOC( -This operator can be, for a given set of ground truth bboxes and the -anchors, to assign classification and regression targets to each prediction. -The ScoreIndex and LocationIndex will be generated according to the anchor-groundtruth IOU. -The rest anchors would not contibute to the RPN training loss - -ScoreIndex is composed of foreground anchor indexes(positive labels) and -background anchor indexes(negative labels). LocationIndex is exactly same -as the foreground anchor indexes since we can not assign regression target to -the background anchors. - -The classification targets(TargetLabel) is a binary class label (of being -an object or not). Following the paper of Faster-RCNN, the positive labels -are two kinds of anchors: (i) the anchor/anchors with the highest IoU -overlap with a ground-truth box, or (ii) an anchor that has an IoU overlap -higher than rpn_positive_overlap(0.7) with any ground-truth box. Note that -a single ground-truth box may assign positive labels to multiple anchors. -A non-positive anchor is when its IoU ratio is lower than rpn_negative_overlap -(0.3) for all ground-truth boxes. Anchors that are neither positive nor -negative do not contribute to the training objective. 
- -)DOC"); - } -}; - -class RetinanetTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("Anchor", - "(Tensor) input anchor is a 2-D Tensor with shape [H*W*A, 4]."); - AddInput("GtBoxes", - "(phi::DenseTensor) input ground-truth bbox with shape [K, 4]."); - AddInput("GtLabels", - "(phi::DenseTensor) input ground-truth label with shape [K, 1]."); - AddInput("IsCrowd", - "(phi::DenseTensor) input which indicates ground-truth is crowd."); - AddInput("ImInfo", - "(phi::DenseTensor) input image information with shape [N, 3]. " - "N is the batch size, each image information includes height, " - "width and scale."); - AddAttr( - "positive_overlap", - "Minimum overlap required between an anchor and ground-truth " - "box for the (anchor, gt box) pair to be a positive example.") - .SetDefault(0.5); - AddAttr( - "negative_overlap", - "Maximum overlap allowed between an anchor and ground-truth " - "box for the (anchor, gt box) pair to be a negative examples.") - .SetDefault(0.4); - AddOutput( - "LocationIndex", - "(Tensor), The indexes of foreground anchors in all anchors, the " - "shape of the LocationIndex is [F], F depends on the value of input " - "tensor and attributes."); - AddOutput( - "ScoreIndex", - "(Tensor), The indexes of foreground and background anchors in all " - "RPN anchors(The rest anchors are ignored). The shape of the " - "ScoreIndex is [F + B], F and B are foreground and background " - " number."); - AddOutput("TargetBBox", - "(Tensor), The target bbox deltas with shape " - "[F, 4], F is the foreground number."); - AddOutput("TargetLabel", - "(Tensor), The target labels of each anchor with shape " - "[F + B, 1], F and B are foreground and background number."); - AddOutput("BBoxInsideWeight", - "(Tensor), The bbox inside weight with shape " - "[F, 4], F is the foreground number."); - AddOutput("ForegroundNumber", - "(Tensor), The foreground number. " - "[1, 1]."); - AddComment(R"DOC( - This layer can be, for given the Intersection-over-Union (IoU) overlap - between anchors and ground truth boxes, to assign classification and - regression targets to each anchor, these target labels are used for - train retinanet. - - Every anchor is assigned with a length C one-hot vector of - classification targets, and a 4-vector of box regression targets, - where C is the class number. The assignment rules are as followed: - - 1. Anchors are assigned to ground-truth boxes when: (i) it has the highest - IoU overlap with a ground-truth box, or (ii) it has an IoU overlap higher - than positive_overlap(0.5) with any ground-truth box. - - 2. Anchors are assigned to background when its IoU ratio is lower than - negative_overlap (0.4) for all ground-truth boxes. - - When an anchor is assigned with a ground-truth box which is the i-th category, - the i-th entry in its C vector of targets is set to 1 and all other entries - are set to 0. When an anchor is assigned with background, all entries are set - to 0. Anchors that are not assigned do not contribute to the training - objective. The regression targets are the encoded ground-truth boxes - associated with the assigned anchors. 
- -)DOC"); - } -}; - -class RetinanetTargetAssignOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK( - ctx->HasInput("Anchor"), "Input", "Anchor", "retinanet_target_assign"); - OP_INOUT_CHECK(ctx->HasInput("GtBoxes"), - "Input", - "GtBoxes", - "retinanet_target_assign"); - OP_INOUT_CHECK(ctx->HasInput("GtLabels"), - "Input", - "GtLabels", - "retinanet_target_assign"); - OP_INOUT_CHECK(ctx->HasInput("IsCrowd"), - "Input", - "IsCrowd", - "retinanet_target_assign"); - OP_INOUT_CHECK( - ctx->HasInput("ImInfo"), "Input", "ImInfo", "retinanet_target_assign"); - OP_INOUT_CHECK(ctx->HasOutput("LocationIndex"), - "Output", - "LocationIndex", - "retinanet_target_assign"); - OP_INOUT_CHECK(ctx->HasOutput("ScoreIndex"), - "Output", - "ScoreIndex", - "retinanet_target_assign"); - OP_INOUT_CHECK(ctx->HasOutput("TargetLabel"), - "Output", - "TargetLabel", - "retinanet_target_assign"); - OP_INOUT_CHECK(ctx->HasOutput("TargetBBox"), - "Output", - "TargetBBox", - "retinanet_target_assign"); - OP_INOUT_CHECK(ctx->HasOutput("BBoxInsideWeight"), - "Output", - "BBoxInsideWeight", - "retinanet_target_assign"); - OP_INOUT_CHECK(ctx->HasOutput("ForegroundNumber"), - "Output", - "ForegroundNumber", - "retinanet_target_assign"); - - auto anchor_dims = ctx->GetInputDim("Anchor"); - auto gt_boxes_dims = ctx->GetInputDim("GtBoxes"); - auto gt_labels_dims = ctx->GetInputDim("GtLabels"); - auto im_info_dims = ctx->GetInputDim("ImInfo"); - - PADDLE_ENFORCE_EQ( - anchor_dims.size(), - 2, - phi::errors::InvalidArgument( - "The rank of Input(Anchor) should be 2, but received Anchor " - "rank is :%d, Anchor shape is:[%s].", - anchor_dims.size(), - anchor_dims)); - PADDLE_ENFORCE_EQ( - gt_boxes_dims.size(), - 2, - phi::errors::InvalidArgument( - "The rank of Input(GtBoxes) should be 2, but received GtBoxes " - "rank is :%d, GtBoxes shape is:[%s].", - gt_boxes_dims.size(), - gt_boxes_dims)); - PADDLE_ENFORCE_EQ( - gt_labels_dims.size(), - 2, - phi::errors::InvalidArgument( - "The rank of Input(GtLabels) should be 2, but received GtLabels " - "rank is :%d, GtLabels shape is:[%s].", - gt_labels_dims.size(), - gt_labels_dims)); - PADDLE_ENFORCE_EQ( - im_info_dims.size(), - 2, - phi::errors::InvalidArgument( - "The rank of Input(ImInfo) should be 2, but received ImInfo " - "rank is :%d, ImInfo shape is:[%s].", - im_info_dims.size(), - im_info_dims)); - - ctx->SetOutputDim("LocationIndex", {gt_labels_dims[0]}); - ctx->SetOutputDim("ScoreIndex", {gt_labels_dims[0]}); - ctx->SetOutputDim("TargetBBox", {gt_labels_dims[0], 4}); - ctx->SetOutputDim("TargetLabel", {gt_labels_dims[0], 1}); - ctx->SetOutputDim("BBoxInsideWeight", {gt_labels_dims[0], 4}); - ctx->SetOutputDim("ForegroundNumber", {gt_labels_dims[0], 1}); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey( - OperatorWithKernel::IndicateVarDataType(ctx, "Anchor"), - platform::CPUPlace()); - } -}; - -template -std::vector FilterCrowdGtBoxLabel( - const phi::CPUContext& context, - phi::DenseTensor* gt_boxes, - phi::DenseTensor* gt_labels, - phi::DenseTensor* is_crowd) { - int gt_num = static_cast(gt_boxes->dims()[0]); - std::vector not_crowd_inds; - auto* is_crowd_data = is_crowd->data(); - for (int i = 0; i < gt_num; ++i) { - if (is_crowd_data[i] == 0) { - not_crowd_inds.emplace_back(i); - } - } - int ncrowd_num = 
static_cast(not_crowd_inds.size()); - phi::DenseTensor ncrowd_gt_boxes, ncrowd_gt_labels; - T* ncrowd_gt_boxes_data = - ncrowd_gt_boxes.mutable_data({ncrowd_num, 4}, context.GetPlace()); - int* ncrowd_gt_labels_data = - ncrowd_gt_labels.mutable_data({ncrowd_num, 1}, context.GetPlace()); - Gather(gt_boxes->data(), - 4, - not_crowd_inds.data(), - ncrowd_num, - ncrowd_gt_boxes_data); - Gather(gt_labels->data(), - 1, - not_crowd_inds.data(), - ncrowd_num, - ncrowd_gt_labels_data); - std::vector res; - res.emplace_back(ncrowd_gt_boxes); - res.emplace_back(ncrowd_gt_labels); - return res; -} - -template -std::vector GetAllFgBgGt( - const phi::CPUContext& ctx, - const phi::DenseTensor& anchor_by_gt_overlap, - const phi::DenseTensor& ncrowd_gt_labels, - const float positive_overlap, - const float negative_overlap, - std::minstd_rand engine) { - auto* anchor_by_gt_overlap_data = anchor_by_gt_overlap.data(); - int anchor_num = static_cast(anchor_by_gt_overlap.dims()[0]); - int gt_num = static_cast(anchor_by_gt_overlap.dims()[1]); - - std::vector fg_inds; - std::vector bg_inds; - std::vector gt_inds; - std::vector tgt_lbl; - std::vector fg_fake; - std::vector bbox_inside_weight; - // Calculate the max IoU between anchors and gt boxes - // Map from anchor to gt box that has highest overlap - auto place = ctx.GetPlace(); - phi::DenseTensor anchor_to_gt_max, anchor_to_gt_argmax, gt_to_anchor_max; - anchor_to_gt_max.mutable_data({anchor_num}, place); - int* argmax = anchor_to_gt_argmax.mutable_data({anchor_num}, place); - gt_to_anchor_max.mutable_data({gt_num}, place); - - auto anchor_by_gt_overlap_et = - framework::EigenMatrix::From(anchor_by_gt_overlap); - auto anchor_to_gt_max_et = - framework::EigenVector::Flatten(anchor_to_gt_max); - auto gt_to_anchor_max_et = - framework::EigenVector::Flatten(gt_to_anchor_max); - auto anchor_to_gt_argmax_et = - framework::EigenVector::Flatten(anchor_to_gt_argmax); - anchor_to_gt_max_et = - anchor_by_gt_overlap_et.maximum(Eigen::DSizes(1)); - anchor_to_gt_argmax_et = - anchor_by_gt_overlap_et.argmax(1).template cast(); - gt_to_anchor_max_et = - anchor_by_gt_overlap_et.maximum(Eigen::DSizes(0)); - - ScoreAssign(anchor_by_gt_overlap_data, - anchor_to_gt_max, - gt_to_anchor_max, - -1, - -1, - positive_overlap, - negative_overlap, - &fg_inds, - &bg_inds, - &tgt_lbl, - &fg_fake, - &bbox_inside_weight, - engine, - false); - const int* gt_labels_data = ncrowd_gt_labels.data(); - int64_t fg_num = static_cast(fg_inds.size()); - for (int64_t i = 0; i < fg_num; ++i) { - int gt_idx = argmax[fg_inds[i]]; - tgt_lbl[i] = gt_labels_data[gt_idx]; - } - - int bg_num = static_cast(bg_inds.size()); - int fg_fake_num = static_cast(fg_fake.size()); - gt_inds.reserve(fg_fake_num); - for (int i = 0; i < fg_fake_num; ++i) { - gt_inds.emplace_back(argmax[fg_fake[i]]); - } - - phi::DenseTensor loc_index_t, score_index_t, tgt_lbl_t, gt_inds_t, - bbox_inside_weight_t; - phi::DenseTensor fg_num_t; - int* loc_index_data = loc_index_t.mutable_data({fg_fake_num}, place); - int* score_index_data = - score_index_t.mutable_data({fg_num + bg_num}, place); - int* tgt_lbl_data = tgt_lbl_t.mutable_data({fg_num + bg_num}, place); - int* gt_inds_data = gt_inds_t.mutable_data({fg_fake_num}, place); - int* fg_num_data = fg_num_t.mutable_data({1}, place); - T* bbox_inside_weight_data = - bbox_inside_weight_t.mutable_data({fg_fake_num, 4}, place); - std::copy(fg_fake.begin(), fg_fake.end(), loc_index_data); - std::copy(fg_inds.begin(), fg_inds.end(), score_index_data); - std::copy(bg_inds.begin(), 
bg_inds.end(), score_index_data + fg_num); - std::copy(tgt_lbl.begin(), tgt_lbl.end(), tgt_lbl_data); - std::copy(gt_inds.begin(), gt_inds.end(), gt_inds_data); - std::copy(bbox_inside_weight.begin(), - bbox_inside_weight.end(), - bbox_inside_weight_data); - fg_num_data[0] = static_cast(fg_fake.size()) + 1; - std::vector loc_score_tgtlbl_gt; - loc_score_tgtlbl_gt.emplace_back(loc_index_t); - loc_score_tgtlbl_gt.emplace_back(score_index_t); - loc_score_tgtlbl_gt.emplace_back(tgt_lbl_t); - loc_score_tgtlbl_gt.emplace_back(gt_inds_t); - loc_score_tgtlbl_gt.emplace_back(bbox_inside_weight_t); - loc_score_tgtlbl_gt.emplace_back(fg_num_t); - - return loc_score_tgtlbl_gt; -} - -template -class RetinanetTargetAssignKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* anchor = context.Input("Anchor"); // (H*W*A) * 4 - auto* gt_boxes = context.Input("GtBoxes"); - auto* gt_labels = context.Input("GtLabels"); - auto* is_crowd = context.Input("IsCrowd"); - auto* im_info = context.Input("ImInfo"); - - auto* loc_index = context.Output("LocationIndex"); - auto* score_index = context.Output("ScoreIndex"); - auto* tgt_bbox = context.Output("TargetBBox"); - auto* tgt_lbl = context.Output("TargetLabel"); - auto* bbox_inside_weight = - context.Output("BBoxInsideWeight"); - auto* fg_num = context.Output("ForegroundNumber"); - - PADDLE_ENFORCE_EQ( - gt_boxes->lod().size(), - 1UL, - phi::errors::InvalidArgument( - "The LoD level of Input(GtBoxes) should be 1, but received GtBoxes " - "LoD level is :%d.", - gt_boxes->lod().size())); - PADDLE_ENFORCE_EQ( - gt_labels->lod().size(), - 1UL, - phi::errors::InvalidArgument("The LoD level of Input(GtLabels) " - "should be 1, but received GtLabels " - "LoD level is :%d.", - gt_labels->lod().size())); - PADDLE_ENFORCE_EQ( - is_crowd->lod().size(), - 1UL, - phi::errors::InvalidArgument( - "The LoD level of Input(IsCrowd) should be 1, but received IsCrowd " - "LoD level is :%d.", - is_crowd->lod().size())); - - int64_t anchor_num = static_cast(anchor->dims()[0]); - int64_t batch_num = static_cast(gt_boxes->lod().back().size() - 1); - - float positive_overlap = context.Attr("positive_overlap"); - float negative_overlap = context.Attr("negative_overlap"); - - int64_t max_num = batch_num * anchor_num; - auto place = context.GetPlace(); - - loc_index->mutable_data({max_num}, place); - score_index->mutable_data({max_num}, place); - tgt_bbox->mutable_data({max_num, 4}, place); - tgt_lbl->mutable_data({max_num, 1}, place); - bbox_inside_weight->mutable_data({max_num, 4}, place); - fg_num->mutable_data({batch_num, 1}, place); - auto& dev_ctx = context.device_context(); - - std::random_device rnd; - std::minstd_rand engine; - int seed = static_cast(rnd()); - engine.seed(seed); - - framework::LoD lod_loc, loc_score, lod_fg; - std::vector lod0_loc(1, 0); - std::vector lod0_score(1, 0); - std::vector lod0_fg(1, 0); - - int total_loc_num = 0; - int total_score_num = 0; - int total_fg_num = 0; - auto gt_boxes_lod = gt_boxes->lod().back(); - auto gt_labels_lod = gt_labels->lod().back(); - auto is_crowd_lod = is_crowd->lod().back(); - for (int i = 0; i < batch_num; ++i) { - phi::DenseTensor gt_boxes_slice = - gt_boxes->Slice(static_cast(gt_boxes_lod[i]), - static_cast(gt_boxes_lod[i + 1])); - phi::DenseTensor gt_labels_slice = - gt_labels->Slice(static_cast(gt_labels_lod[i]), - static_cast(gt_labels_lod[i + 1])); - phi::DenseTensor is_crowd_slice = - is_crowd->Slice(static_cast(is_crowd_lod[i]), - 
static_cast(is_crowd_lod[i + 1])); - phi::DenseTensor im_info_slice = im_info->Slice(i, i + 1); - auto* im_info_data = im_info_slice.data(); - auto im_height = im_info_data[0]; - auto im_width = im_info_data[1]; - auto im_scale = im_info_data[2]; - - // Filter straddle anchor - std::vector filter_output = - FilterStraddleAnchor(dev_ctx, anchor, -1, im_height, im_width); - phi::DenseTensor inds_inside = filter_output[0]; - phi::DenseTensor inside_anchor = filter_output[1]; - - // Filter crowd gt - std::vector ncrowd_output = FilterCrowdGtBoxLabel( - dev_ctx, >_boxes_slice, >_labels_slice, &is_crowd_slice); - phi::DenseTensor ncrowd_gt_boxes = ncrowd_output[0]; - phi::DenseTensor ncrowd_gt_labels = ncrowd_output[1]; - - auto ncrowd_gt_boxes_et = - framework::EigenTensor::From(ncrowd_gt_boxes); - ncrowd_gt_boxes_et = ncrowd_gt_boxes_et * im_scale; - - phi::DenseTensor anchor_by_gt_overlap; - anchor_by_gt_overlap.mutable_data( - {inside_anchor.dims()[0], ncrowd_gt_boxes.dims()[0]}, place); - BboxOverlaps(inside_anchor, ncrowd_gt_boxes, &anchor_by_gt_overlap); - - auto loc_score_tgtlbl_gt = GetAllFgBgGt(dev_ctx, - anchor_by_gt_overlap, - ncrowd_gt_labels, - positive_overlap, - negative_overlap, - engine); - - phi::DenseTensor sampled_loc_index = loc_score_tgtlbl_gt[0]; - phi::DenseTensor sampled_score_index = loc_score_tgtlbl_gt[1]; - phi::DenseTensor sampled_tgtlbl = loc_score_tgtlbl_gt[2]; - phi::DenseTensor sampled_gt_index = loc_score_tgtlbl_gt[3]; - phi::DenseTensor sampled_bbox_inside_weight = loc_score_tgtlbl_gt[4]; - phi::DenseTensor sampled_fg_num = loc_score_tgtlbl_gt[5]; - - int loc_num = static_cast(sampled_loc_index.dims()[0]); - int score_num = static_cast(sampled_score_index.dims()[0]); - // unmap to all anchor - phi::DenseTensor sampled_loc_index_unmap, sampled_score_index_unmap; - sampled_loc_index_unmap.mutable_data({loc_num}, place); - sampled_score_index_unmap.mutable_data({score_num}, place); - Gather(inds_inside.data(), - 1, - sampled_loc_index.data(), - loc_num, - sampled_loc_index_unmap.data()); - Gather(inds_inside.data(), - 1, - sampled_score_index.data(), - score_num, - sampled_score_index_unmap.data()); - - // get target bbox deltas - phi::DenseTensor sampled_anchor, sampled_gt, sampled_tgt_bbox; - auto* sampled_anchor_data = - sampled_anchor.mutable_data({loc_num, 4}, place); - auto* sampled_gt_data = sampled_gt.mutable_data({loc_num, 4}, place); - Gather(anchor->data(), - 4, - sampled_loc_index_unmap.data(), - loc_num, - sampled_anchor_data); - Gather(ncrowd_gt_boxes.data(), - 4, - sampled_gt_index.data(), - loc_num, - sampled_gt_data); - sampled_tgt_bbox.mutable_data({loc_num, 4}, place); - BoxToDelta(loc_num, - sampled_anchor, - sampled_gt, - nullptr, - false, - &sampled_tgt_bbox); - - // Add anchor offset - int anchor_offset = static_cast(i * anchor_num); - auto sampled_loc_index_unmap_et = - framework::EigenTensor::From(sampled_loc_index_unmap); - sampled_loc_index_unmap_et = sampled_loc_index_unmap_et + anchor_offset; - auto sampled_score_index_unmap_et = - framework::EigenTensor::From(sampled_score_index_unmap); - sampled_score_index_unmap_et = - sampled_score_index_unmap_et + anchor_offset; - AppendRpns(loc_index, total_loc_num, &sampled_loc_index_unmap); - AppendRpns(score_index, total_score_num, &sampled_score_index_unmap); - AppendRpns(tgt_bbox, total_loc_num * 4, &sampled_tgt_bbox); - AppendRpns(tgt_lbl, total_score_num, &sampled_tgtlbl); - AppendRpns( - bbox_inside_weight, total_loc_num * 4, &sampled_bbox_inside_weight); - AppendRpns(fg_num, 
total_fg_num, &sampled_fg_num); - - total_loc_num += loc_num; - total_score_num += score_num; - total_fg_num += 1; - lod0_loc.emplace_back(total_loc_num); - lod0_score.emplace_back(total_score_num); - lod0_fg.emplace_back(total_fg_num); - } - - PADDLE_ENFORCE_LE( - total_loc_num, - max_num, - phi::errors::InvalidArgument( - "The number of sampled bboxes should not be greater than the " - "number of all anchor boxes(%d), but the number of sampled " - "bboxes is :%d.", - max_num, - total_loc_num)); - PADDLE_ENFORCE_LE( - total_score_num, - max_num, - phi::errors::InvalidArgument( - "The number of sampled scores should not be greater than the " - "number of all anchor boxes(%d), but the number of sampled " - "scores is :%d.", - max_num, - total_score_num)); - PADDLE_ENFORCE_LE( - total_fg_num, - batch_num, - phi::errors::InvalidArgument( - "The number of foreground numbers should not be greater than the " - "batch size(%d), but the number of foreground numbers is :%d.", - batch_num, - total_fg_num)); - - lod_loc.emplace_back(lod0_loc); - loc_score.emplace_back(lod0_score); - lod_fg.emplace_back(lod0_fg); - loc_index->set_lod(lod_loc); - score_index->set_lod(loc_score); - tgt_bbox->set_lod(lod_loc); - tgt_lbl->set_lod(loc_score); - bbox_inside_weight->set_lod(lod_loc); - fg_num->set_lod(lod_fg); - loc_index->Resize({total_loc_num}); - score_index->Resize({total_score_num}); - tgt_bbox->Resize({total_loc_num, 4}); - tgt_lbl->Resize({total_score_num, 1}); - bbox_inside_weight->Resize({total_loc_num, 4}); - fg_num->Resize({total_fg_num, 1}); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR( - rpn_target_assign, - ops::RpnTargetAssignOp, - ops::RpnTargetAssignOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); -PD_REGISTER_STRUCT_KERNEL(rpn_target_assign, - CPU, - ALL_LAYOUT, - ops::RpnTargetAssignKernel, - float, - double) {} -REGISTER_OPERATOR( - retinanet_target_assign, - ops::RetinanetTargetAssignOp, - ops::RetinanetTargetAssignOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); -PD_REGISTER_STRUCT_KERNEL(retinanet_target_assign, - CPU, - ALL_LAYOUT, - ops::RetinanetTargetAssignKernel, - float, - double) {} diff --git a/paddle/fluid/operators/detection_map_op.cc b/paddle/fluid/operators/detection_map_op.cc deleted file mode 100644 index cee37d49eb69b..0000000000000 --- a/paddle/fluid/operators/detection_map_op.cc +++ /dev/null @@ -1,229 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/detection_map_op.h" - -#include - -namespace paddle { -namespace operators { - -class DetectionMAPOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK( - ctx->HasInput("DetectRes"), "Input", "DetectRes", "DetectionMAP"); - OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label", "DetectionMAP"); - OP_INOUT_CHECK(ctx->HasOutput("AccumPosCount"), - "Output", - "AccumPosCount", - "DetectionMAP"); - OP_INOUT_CHECK(ctx->HasOutput("AccumTruePos"), - "Output", - "AccumTruePos", - "DetectionMAP"); - OP_INOUT_CHECK(ctx->HasOutput("AccumFalsePos"), - "Output", - "AccumFalsePos", - "DetectionMAP"); - OP_INOUT_CHECK(ctx->HasOutput("MAP"), "Output", "MAP", "DetectionMAP"); - - auto det_dims = ctx->GetInputDim("DetectRes"); - PADDLE_ENFORCE_EQ( - det_dims.size(), - 2UL, - phi::errors::InvalidArgument( - "Input(DetectRes) ndim must be 2, the shape is [N, 6]," - "but received the ndim is %d", - det_dims.size())); - PADDLE_ENFORCE_EQ( - det_dims[1], - 6UL, - phi::errors::InvalidArgument( - "The shape is of Input(DetectRes) [N, 6], but received" - " shape is [N, %d]", - det_dims[1])); - auto label_dims = ctx->GetInputDim("Label"); - PADDLE_ENFORCE_EQ(label_dims.size(), - 2, - phi::errors::InvalidArgument( - "The ndim of Input(Label) must be 2, but received %d", - label_dims.size())); - if (ctx->IsRuntime() || label_dims[1] > 0) { - PADDLE_ENFORCE_EQ( - (label_dims[1] == 6 || label_dims[1] == 5), - true, - phi::errors::InvalidArgument( - "The shape of Input(Label) is [N, 6] or [N, 5], but received " - "[N, %d]", - label_dims[1])); - } - - if (ctx->HasInput("PosCount")) { - PADDLE_ENFORCE( - ctx->HasInput("TruePos"), - phi::errors::InvalidArgument( - "Input(TruePos) of DetectionMAPOp should not be null when " - "Input(PosCount) is not null.")); - PADDLE_ENFORCE( - ctx->HasInput("FalsePos"), - phi::errors::InvalidArgument( - "Input(FalsePos) of DetectionMAPOp should not be null when " - "Input(PosCount) is not null.")); - } - - ctx->SetOutputDim("MAP", common::make_ddim({1})); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey( - OperatorWithKernel::IndicateVarDataType(ctx, "DetectRes"), - platform::CPUPlace()); - } -}; - -class DetectionMAPOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("DetectRes", - "(phi::DenseTensor) A 2-D phi::DenseTensor with shape [M, 6] " - "represents the " - "detections. Each row has 6 values: " - "[label, confidence, xmin, ymin, xmax, ymax], M is the total " - "number of detect results in this mini-batch. For each instance, " - "the offsets in first dimension are called LoD, the number of " - "offset is N + 1, if LoD[i + 1] - LoD[i] == 0, means there is " - "no detected data."); - AddInput("Label", - "(phi::DenseTensor) A 2-D phi::DenseTensor represents the" - "Labeled ground-truth data. Each row has 6 values: " - "[label, xmin, ymin, xmax, ymax, is_difficult] or 5 values: " - "[label, xmin, ymin, xmax, ymax], where N is the total " - "number of ground-truth data in this mini-batch. 
For each " - "instance, the offsets in first dimension are called LoD, " - "the number of offset is N + 1, if LoD[i + 1] - LoD[i] == 0, " - "means there is no ground-truth data."); - AddInput("HasState", - "(Tensor) A tensor with shape [1], 0 means ignoring input " - "states, which including PosCount, TruePos, FalsePos.") - .AsDispensable(); - AddInput("PosCount", - "(Tensor) A tensor with shape [Ncls, 1], store the " - "input positive example count of each class, Ncls is the count of " - "input classification. " - "This input is used to pass the AccumPosCount generated by the " - "previous mini-batch when the multi mini-batches cumulative " - "calculation carried out. " - "When the input(PosCount) is empty, the cumulative " - "calculation is not carried out, and only the results of the " - "current mini-batch are calculated.") - .AsDispensable(); - AddInput("TruePos", - "(phi::DenseTensor) A 2-D phi::DenseTensor with shape [Ntp, 2], " - "store the " - "input true positive example of each class." - "This input is used to pass the AccumTruePos generated by the " - "previous mini-batch when the multi mini-batches cumulative " - "calculation carried out. ") - .AsDispensable(); - AddInput("FalsePos", - "(phi::DenseTensor) A 2-D phi::DenseTensor with shape [Nfp, 2], " - "store the " - "input false positive example of each class." - "This input is used to pass the AccumFalsePos generated by the " - "previous mini-batch when the multi mini-batches cumulative " - "calculation carried out. ") - .AsDispensable(); - AddOutput("AccumPosCount", - "(Tensor) A tensor with shape [Ncls, 1], store the " - "positive example count of each class. It combines the input " - "input(PosCount) and the positive example count computed from " - "input(Detection) and input(Label)."); - AddOutput( - "AccumTruePos", - "(phi::DenseTensor) A phi::DenseTensor with shape [Ntp', 2], store the " - "true positive example of each class. It combines the " - "input(TruePos) and the true positive examples computed from " - "input(Detection) and input(Label)."); - AddOutput( - "AccumFalsePos", - "(phi::DenseTensor) A phi::DenseTensor with shape [Nfp', 2], store the " - "false positive example of each class. It combines the " - "input(FalsePos) and the false positive examples computed from " - "input(Detection) and input(Label)."); - AddOutput("MAP", - "(Tensor) A tensor with shape [1], store the mAP evaluate " - "result of the detection."); - AddAttr("class_num", - "(int) " - "The class number."); - AddAttr( - "background_label", - "(int, default: 0) " - "The index of background label, the background label will be ignored. " - "If set to -1, then all categories will be considered.") - .SetDefault(0); - AddAttr( - "overlap_threshold", - "(float) " - "The lower bound jaccard overlap threshold of detection output and " - "ground-truth data.") - .SetDefault(.5f); - AddAttr("evaluate_difficult", - "(bool, default true) " - "Switch to control whether the difficult data is evaluated.") - .SetDefault(true); - AddAttr("ap_type", - "(string, default 'integral') " - "The AP algorithm type, 'integral' or '11point'.") - .SetDefault("integral") - .InEnum({"integral", "11point"}) - .AddCustomChecker([](const std::string& ap_type) { - PADDLE_ENFORCE_NE( - GetAPType(ap_type), - APType::kNone, - phi::errors::InvalidArgument( - "The ap_type should be 'integral' or '11point.")); - }); - AddComment(R"DOC( -Detection mAP evaluate operator. -The general steps are as follows. 
First, the true positives and
-false positives are computed from the detection results and the labels;
-then the mAP value is calculated.
-Both the '11point' and 'integral' mAP algorithms are supported. More
-information can be found in the following articles:
-https://sanchom.wordpress.com/tag/average-precision/
-https://arxiv.org/abs/1512.02325
-
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(
-    detection_map,
-    ops::DetectionMAPOp,
-    ops::DetectionMAPOpMaker,
-    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
-    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
-
-PD_REGISTER_STRUCT_KERNEL(
-    detection_map, CPU, ALL_LAYOUT, ops::DetectionMAPOpKernel, float, double) {}
diff --git a/paddle/fluid/operators/detection_map_op.h b/paddle/fluid/operators/detection_map_op.h
deleted file mode 100644
index 24fea9c431c63..0000000000000
--- a/paddle/fluid/operators/detection_map_op.h
+++ /dev/null
@@ -1,518 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <algorithm>
-#include <map>
-#include <string>
-#include <utility>
-#include <vector>
-
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-enum APType { kNone = 0, kIntegral, k11point };
-
-APType GetAPType(std::string str) {
-  if (str == "integral") {
-    return APType::kIntegral;
-  } else if (str == "11point") {
-    return APType::k11point;
-  } else {
-    return APType::kNone;
-  }
-}
-
-template <typename T>
-inline bool SortScorePairDescend(const std::pair<float, T>& pair1,
-                                 const std::pair<float, T>& pair2) {
-  return pair1.first > pair2.first;
-}
-
-template <typename T>
-inline void GetAccumulation(std::vector<std::pair<T, int>> in_pairs,
-                            std::vector<int>* accu_vec) {
-  std::stable_sort(in_pairs.begin(), in_pairs.end(), SortScorePairDescend<int>);
-  accu_vec->clear();
-  size_t sum = 0;
-  for (size_t i = 0; i < in_pairs.size(); ++i) {
-    auto count = in_pairs[i].second;
-    sum += count;
-    accu_vec->push_back(sum);
-  }
-}
-
-template <typename T, typename DeviceContext>
-class DetectionMAPOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in_detect = ctx.Input<phi::DenseTensor>("DetectRes");
-    auto* in_label = ctx.Input<phi::DenseTensor>("Label");
-    auto* out_map = ctx.Output<phi::DenseTensor>("MAP");
-
-    auto* in_pos_count = ctx.Input<phi::DenseTensor>("PosCount");
-    auto* in_true_pos = ctx.Input<phi::DenseTensor>("TruePos");
-    auto* in_false_pos = ctx.Input<phi::DenseTensor>("FalsePos");
-
-    auto* out_pos_count = ctx.Output<phi::DenseTensor>("AccumPosCount");
-    auto* out_true_pos = ctx.Output<phi::DenseTensor>("AccumTruePos");
-    auto* out_false_pos = ctx.Output<phi::DenseTensor>("AccumFalsePos");
-
-    float overlap_threshold = ctx.Attr<float>("overlap_threshold");
-    bool evaluate_difficult = ctx.Attr<bool>("evaluate_difficult");
-    auto ap_type = GetAPType(ctx.Attr<std::string>("ap_type"));
-    int class_num = ctx.Attr<int>("class_num");
-
-    auto& label_lod = in_label->lod();
-    auto& detect_lod = in_detect->lod();
-    PADDLE_ENFORCE_EQ(
-        label_lod.size(),
-        1UL,
-        phi::errors::InvalidArgument("Only support LoDTensor with lod_level "
-                                     "of 1 in label, but received %d.",
-                                     label_lod.size()));
-    
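
// Hypothetical usage of GetAccumulation defined above: it sorts the
// (score, flag) pairs by descending score and returns running flag sums,
// i.e. the cumulative true-positive (or false-positive) count at each rank:
//
//   std::vector<std::pair<float, int>> tps = {{0.9f, 1}, {0.7f, 0}, {0.8f, 1}};
//   std::vector<int> tp_sum;
//   GetAccumulation<float>(tps, &tp_sum);
//   // Sorted by score: 0.9, 0.8, 0.7 with flags 1, 1, 0 -> tp_sum == {1, 2, 2}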
PADDLE_ENFORCE_EQ(label_lod[0].size(), - detect_lod[0].size(), - phi::errors::InvalidArgument( - "The batch_size of input(Label) and input(Detection) " - "must be the same, but received %d:%d", - label_lod[0].size(), - detect_lod[0].size())); - - std::vector>> gt_boxes; - std::vector>>> detect_boxes; - - GetBoxes(*in_label, *in_detect, >_boxes, detect_boxes); - - std::map label_pos_count; - std::map>> true_pos; - std::map>> false_pos; - - auto* has_state = ctx.Input("HasState"); - int state = 0; - if (has_state) { - state = has_state->data()[0]; - } - - if (in_pos_count != nullptr && state) { - GetInputPos(*in_pos_count, - *in_true_pos, - *in_false_pos, - &label_pos_count, - &true_pos, - &false_pos, - class_num); - } - - CalcTrueAndFalsePositive(gt_boxes, - detect_boxes, - evaluate_difficult, - overlap_threshold, - &label_pos_count, - &true_pos, - &false_pos); - - int background_label = ctx.Attr("background_label"); - T map = CalcMAP( - ap_type, label_pos_count, true_pos, false_pos, background_label); - - GetOutputPos(ctx, - label_pos_count, - true_pos, - false_pos, - out_pos_count, - out_true_pos, - out_false_pos, - class_num); - - T* map_data = out_map->mutable_data(ctx.GetPlace()); - map_data[0] = map; - } - - protected: - struct Box { - Box(T xmin, T ymin, T xmax, T ymax) - : xmin(xmin), ymin(ymin), xmax(xmax), ymax(ymax), is_difficult(false) {} - - T xmin, ymin, xmax, ymax; - bool is_difficult; - }; - - inline T JaccardOverlap(const Box& box1, const Box& box2) const { - if (box2.xmin > box1.xmax || box2.xmax < box1.xmin || - box2.ymin > box1.ymax || box2.ymax < box1.ymin) { - return 0.0; - } else { - T inter_xmin = std::max(box1.xmin, box2.xmin); - T inter_ymin = std::max(box1.ymin, box2.ymin); - T inter_xmax = std::min(box1.xmax, box2.xmax); - T inter_ymax = std::min(box1.ymax, box2.ymax); - - T inter_width = inter_xmax - inter_xmin; - T inter_height = inter_ymax - inter_ymin; - T inter_area = inter_width * inter_height; - - T bbox_area1 = (box1.xmax - box1.xmin) * (box1.ymax - box1.ymin); - T bbox_area2 = (box2.xmax - box2.xmin) * (box2.ymax - box2.ymin); - - return inter_area / (bbox_area1 + bbox_area2 - inter_area); - } - } - - inline void ClipBBox(const Box& bbox, Box* clipped_bbox) const { - T one = static_cast(1.0); - T zero = static_cast(0.0); - clipped_bbox->xmin = std::max(std::min(bbox.xmin, one), zero); - clipped_bbox->ymin = std::max(std::min(bbox.ymin, one), zero); - clipped_bbox->xmax = std::max(std::min(bbox.xmax, one), zero); - clipped_bbox->ymax = std::max(std::min(bbox.ymax, one), zero); - } - - void GetBoxes(const phi::DenseTensor& input_label, - const phi::DenseTensor& input_detect, - std::vector>>* gt_boxes, - std::vector>>>& - detect_boxes) const { - auto labels = framework::EigenTensor::From(input_label); - auto detect = framework::EigenTensor::From(input_detect); - - auto& label_lod = input_label.lod(); - auto& detect_lod = input_detect.lod(); - - int batch_size = label_lod[0].size() - 1; - auto& label_index = label_lod[0]; - - for (int n = 0; n < batch_size; ++n) { - std::map> boxes; - for (size_t i = label_index[n]; i < label_index[n + 1]; ++i) { - int label = labels(i, 0); - if (input_label.dims()[1] == 6) { - Box box(labels(i, 2), labels(i, 3), labels(i, 4), labels(i, 5)); - auto is_difficult = labels(i, 1); - if (std::abs(is_difficult - 0.0) < 1e-6) - box.is_difficult = false; - else - box.is_difficult = true; - boxes[label].push_back(box); - } else { - PADDLE_ENFORCE_EQ( - input_label.dims()[1], - 5, - phi::errors::InvalidArgument( - "The input label 
width" - " must be 5, but received %d, please check your input data", - input_label.dims()[1])); - Box box(labels(i, 1), labels(i, 2), labels(i, 3), labels(i, 4)); - boxes[label].push_back(box); - } - } - gt_boxes->push_back(boxes); - } - - auto detect_index = detect_lod[0]; - for (int n = 0; n < batch_size; ++n) { - std::map>> boxes; - for (size_t i = detect_index[n]; i < detect_index[n + 1]; ++i) { - Box box(detect(i, 2), detect(i, 3), detect(i, 4), detect(i, 5)); - int label = detect(i, 0); - auto score = detect(i, 1); - boxes[label].push_back(std::make_pair(score, box)); - } - detect_boxes.push_back(boxes); - } - } - - void GetOutputPos( - const framework::ExecutionContext& ctx, - const std::map& label_pos_count, - const std::map>>& true_pos, - const std::map>>& false_pos, - phi::DenseTensor* output_pos_count, - phi::DenseTensor* output_true_pos, - phi::DenseTensor* output_false_pos, - const int class_num) const { - int true_pos_count = 0; - int false_pos_count = 0; - for (auto it = true_pos.begin(); it != true_pos.end(); ++it) { - auto tp = it->second; - true_pos_count += tp.size(); - } - for (auto it = false_pos.begin(); it != false_pos.end(); ++it) { - auto fp = it->second; - false_pos_count += fp.size(); - } - - int* pos_count_data = output_pos_count->mutable_data( - common::make_ddim({class_num, 1}), ctx.GetPlace()); - - T* true_pos_data = output_true_pos->mutable_data( - common::make_ddim({true_pos_count, 2}), ctx.GetPlace()); - T* false_pos_data = output_false_pos->mutable_data( - common::make_ddim({false_pos_count, 2}), ctx.GetPlace()); - true_pos_count = 0; - false_pos_count = 0; - std::vector true_pos_starts = {0}; - std::vector false_pos_starts = {0}; - for (int i = 0; i < class_num; ++i) { - auto it_count = label_pos_count.find(i); - pos_count_data[i] = 0; - if (it_count != label_pos_count.end()) { - pos_count_data[i] = it_count->second; - } - auto it_true_pos = true_pos.find(i); - if (it_true_pos != true_pos.end()) { - const std::vector>& true_pos_vec = - it_true_pos->second; - for (const std::pair& tp : true_pos_vec) { - true_pos_data[true_pos_count * 2] = tp.first; - true_pos_data[true_pos_count * 2 + 1] = static_cast(tp.second); - true_pos_count++; - } - } - true_pos_starts.push_back(true_pos_count); - - auto it_false_pos = false_pos.find(i); - if (it_false_pos != false_pos.end()) { - const std::vector>& false_pos_vec = - it_false_pos->second; - for (const std::pair& fp : false_pos_vec) { - false_pos_data[false_pos_count * 2] = fp.first; - false_pos_data[false_pos_count * 2 + 1] = static_cast(fp.second); - false_pos_count++; - } - } - false_pos_starts.push_back(false_pos_count); - } - - framework::LoD true_pos_lod; - true_pos_lod.emplace_back(true_pos_starts); - framework::LoD false_pos_lod; - false_pos_lod.emplace_back(false_pos_starts); - - output_true_pos->set_lod(true_pos_lod); - output_false_pos->set_lod(false_pos_lod); - } - - void GetInputPos(const phi::DenseTensor& input_pos_count, - const phi::DenseTensor& input_true_pos, - const phi::DenseTensor& input_false_pos, - std::map* label_pos_count, - std::map>>* true_pos, - std::map>>* false_pos, - const int class_num) const { - const int* pos_count_data = input_pos_count.data(); - for (int i = 0; i < class_num; ++i) { - (*label_pos_count)[i] = pos_count_data[i]; - } - - auto SetData = [](const phi::DenseTensor& pos_tensor, - std::map>>& pos) { - const T* pos_data = pos_tensor.data(); - auto& pos_data_lod = pos_tensor.lod()[0]; - for (size_t i = 0; i < pos_data_lod.size() - 1; ++i) { - for (size_t j = 
pos_data_lod[i]; j < pos_data_lod[i + 1]; ++j) { - T score = pos_data[j * 2]; - int flag = pos_data[j * 2 + 1]; - pos[i].push_back(std::make_pair(score, flag)); - } - } - }; - - SetData(input_true_pos, *true_pos); - SetData(input_false_pos, *false_pos); - return; - } - - void CalcTrueAndFalsePositive( - const std::vector>>& gt_boxes, - const std::vector>>>& - detect_boxes, - bool evaluate_difficult, - float overlap_threshold, - std::map* label_pos_count, - std::map>>* true_pos, - std::map>>* false_pos) const { - int batch_size = gt_boxes.size(); - for (int n = 0; n < batch_size; ++n) { - auto& image_gt_boxes = gt_boxes[n]; - for (auto& image_gt_box : image_gt_boxes) { - size_t count = 0; - auto& labeled_bboxes = image_gt_box.second; - if (evaluate_difficult) { - count = labeled_bboxes.size(); - } else { - for (auto& box : labeled_bboxes) { - if (!box.is_difficult) { - ++count; - } - } - } - if (count == 0) { - continue; - } - int label = image_gt_box.first; - if (label_pos_count->find(label) == label_pos_count->end()) { - (*label_pos_count)[label] = count; - } else { - (*label_pos_count)[label] += count; - } - } - } - - for (size_t n = 0; n < detect_boxes.size(); ++n) { - auto image_gt_boxes = gt_boxes[n]; - auto detections = detect_boxes[n]; - - if (image_gt_boxes.size() == 0) { - for (auto it = detections.begin(); it != detections.end(); ++it) { - auto pred_boxes = it->second; - int label = it->first; - for (size_t i = 0; i < pred_boxes.size(); ++i) { - auto score = pred_boxes[i].first; - (*true_pos)[label].push_back(std::make_pair(score, 0)); - (*false_pos)[label].push_back(std::make_pair(score, 1)); - } - } - continue; - } - - for (auto it = detections.begin(); it != detections.end(); ++it) { - int label = it->first; - auto pred_boxes = it->second; - if (image_gt_boxes.find(label) == image_gt_boxes.end()) { - for (size_t i = 0; i < pred_boxes.size(); ++i) { - auto score = pred_boxes[i].first; - (*true_pos)[label].push_back(std::make_pair(score, 0)); - (*false_pos)[label].push_back(std::make_pair(score, 1)); - } - continue; - } - - auto matched_bboxes = image_gt_boxes.find(label)->second; - std::vector visited(matched_bboxes.size(), false); - // Sort detections in descend order based on scores - std::sort( - pred_boxes.begin(), pred_boxes.end(), SortScorePairDescend); - for (size_t i = 0; i < pred_boxes.size(); ++i) { - T max_overlap = -1.0; - size_t max_idx = 0; - auto score = pred_boxes[i].first; - for (size_t j = 0; j < matched_bboxes.size(); ++j) { - Box& pred_box = pred_boxes[i].second; - ClipBBox(pred_box, &pred_box); - T overlap = JaccardOverlap(pred_box, matched_bboxes[j]); - if (overlap > max_overlap) { - max_overlap = overlap; - max_idx = j; - } - } - if (max_overlap > overlap_threshold) { - bool match_evaluate_difficult = - evaluate_difficult || - (!evaluate_difficult && !matched_bboxes[max_idx].is_difficult); - if (match_evaluate_difficult) { - if (!visited[max_idx]) { - (*true_pos)[label].push_back(std::make_pair(score, 1)); - (*false_pos)[label].push_back(std::make_pair(score, 0)); - visited[max_idx] = true; - } else { - (*true_pos)[label].push_back(std::make_pair(score, 0)); - (*false_pos)[label].push_back(std::make_pair(score, 1)); - } - } - } else { - (*true_pos)[label].push_back(std::make_pair(score, 0)); - (*false_pos)[label].push_back(std::make_pair(score, 1)); - } - } - } - } - } - - T CalcMAP(APType ap_type, - const std::map& label_pos_count, - const std::map>>& true_pos, - const std::map>>& false_pos, - const int background_label) const { - T mAP = 0.0; - int 
count = 0;
-    for (auto it = label_pos_count.begin(); it != label_pos_count.end();
-         ++it) {
-      int label = it->first;
-      int label_num_pos = it->second;
-      if (label_num_pos == background_label) {
-        continue;
-      }
-      if (true_pos.find(label) == true_pos.end()) {
-        count++;
-        continue;
-      }
-      auto label_true_pos = true_pos.find(label)->second;
-      auto label_false_pos = false_pos.find(label)->second;
-      // Compute average precision.
-      std::vector<int> tp_sum;
-      GetAccumulation<T>(label_true_pos, &tp_sum);
-      std::vector<int> fp_sum;
-      GetAccumulation<T>(label_false_pos, &fp_sum);
-      std::vector<float> precision, recall;
-      size_t num = tp_sum.size();
-      // Compute precision and recall at each rank.
-      for (size_t i = 0; i < num; ++i) {
-        precision.push_back(static_cast<float>(tp_sum[i]) /
-                            static_cast<float>(tp_sum[i] + fp_sum[i]));
-        recall.push_back(static_cast<float>(tp_sum[i]) / label_num_pos);
-      }
-      // VOC2007 style: 11-point interpolated average precision.
-      if (ap_type == APType::k11point) {
-        std::vector<float> max_precisions(11, 0.0);
-        int start_idx = num - 1;
-        for (int j = 10; j >= 0; --j)
-          for (int i = start_idx; i >= 0; --i) {
-            if (recall[i] < j / 10.) {
-              start_idx = i;
-              if (j > 0) max_precisions[j - 1] = max_precisions[j];
-              break;
-            } else {
-              if (max_precisions[j] < precision[i])
-                max_precisions[j] = precision[i];
-            }
-          }
-        for (int j = 10; j >= 0; --j) mAP += max_precisions[j] / 11;
-        ++count;
-      } else if (ap_type == APType::kIntegral) {
-        // Natural integral of the precision-recall curve.
-        float average_precisions = 0.;
-        float prev_recall = 0.;
-        for (size_t i = 0; i < num; ++i) {
-          if (fabs(recall[i] - prev_recall) > 1e-6)
-            average_precisions += precision[i] * fabs(recall[i] - prev_recall);
-          prev_recall = recall[i];
-        }
-        mAP += average_precisions;
-        ++count;
-      } else {
-        PADDLE_THROW(phi::errors::Unimplemented(
-            "Unknown ap_type %s. Now only 'integral' and '11point' are "
-            "supported.",
-            ap_type));
-      }
-    }
-    if (count != 0) mAP /= count;
-    return mAP;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/enqueue_op.cc b/paddle/fluid/operators/enqueue_op.cc
deleted file mode 100644
index 225a2e067e190..0000000000000
--- a/paddle/fluid/operators/enqueue_op.cc
+++ /dev/null
@@ -1,90 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
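
// A self-contained sketch of the 'integral' AP rule used by CalcMAP in the
// deleted detection_map_op.h above: accumulate precision * delta(recall)
// over the ranked detections. The precision/recall values here are
// illustrative numbers, not taken from the operator.
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  // Cumulative precision/recall at each rank (derived from tp_sum/fp_sum).
  std::vector<float> precision = {1.0f, 1.0f, 0.67f};
  std::vector<float> recall = {0.5f, 1.0f, 1.0f};
  float ap = 0.0f, prev_recall = 0.0f;
  for (size_t i = 0; i < precision.size(); ++i) {
    if (std::fabs(recall[i] - prev_recall) > 1e-6f)
      ap += precision[i] * std::fabs(recall[i] - prev_recall);
    prev_recall = recall[i];
  }
  std::printf("integral AP = %.2f\n", ap);  // 1.0*0.5 + 1.0*0.5 = 1.00
  return 0;
}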
- -#include - -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/var_type_traits.h" -#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" - -namespace paddle { -namespace framework { -class OpDesc; -template -class EmptyGradOpMaker; -} // namespace framework -namespace imperative { -class OpBase; -} // namespace imperative -} // namespace paddle - -using LoDTensorBlockingQueueHolder = - paddle::operators::reader::LoDTensorBlockingQueueHolder; - -namespace paddle { -namespace operators { - -class EnqueueOp : public framework::OperatorBase { - public: - EnqueueOp(const std::string& type, - const framework::VariableNameMap& inputs, - const framework::VariableNameMap& outputs, - const framework::AttributeMap& attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - private: - void RunImpl(const framework::Scope& scope, - const platform::Place& dev_place) const override { - const std::string& queue_name = Attr("queue_name"); - auto* queue_holder_var = scope.FindVar(queue_name); - PADDLE_ENFORCE_NOT_NULL( - queue_holder_var, - phi::errors::NotFound( - "No LoDTensorBlockingQueueHolder variable with name %s found.", - queue_name)); - const std::string& var_name = Input("X"); - auto* in_var = scope.FindVar(var_name); - PADDLE_ENFORCE_NOT_NULL( - in_var, - phi::errors::NotFound("No variable with name %s found.", var_name)); - auto* in_tensor = in_var->GetMutable(); - auto* queue_holder = - queue_holder_var->template GetMutable(); - - paddle::framework::LoDTensorArray lod_tensor_vec; - lod_tensor_vec.emplace_back(*in_tensor); - queue_holder->GetQueue()->Push(lod_tensor_vec); - } -}; - -class EnqueueOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "`lod_tensor` to enqueue"); - AddAttr("queue_name", - "Name of the `LoDTensorBlockingQueueHolder` variable"); - AddComment(R"DOC( - Enqueue operator. 
- )DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = ::paddle::operators; - -REGISTER_OP_WITHOUT_GRADIENT(enqueue, ops::EnqueueOp, ops::EnqueueOpMaker); diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt index 713ad1931ce23..517761bdba8b5 100755 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -16,16 +16,12 @@ register_operators( fused_feedforward_op fused_multi_transformer_op fused_multi_transformer_int8_op - resnet_unit_op fused_gemm_epilogue_op - fused_gate_attention_op - resnet_basic_block_op) + fused_gate_attention_op) op_library(fusion_lstm_op) if(WITH_XPU) - op_library(resnet_basic_block_op) - op_library(resnet_unit_op) op_library(fused_gemm_epilogue_op) op_library(fused_attention_op) op_library(fused_feedforward_op) @@ -60,10 +56,6 @@ if(WITH_GPU OR WITH_ROCM) op_library(fused_multi_transformer_op) op_library(fused_multi_transformer_int8_op) endif() - # resnet_unit needs cudnn 8.0 above - if((NOT WITH_ROCM) AND (NOT ${CUDNN_VERSION} VERSION_LESS 8000)) - op_library(resnet_unit_op) - endif() if(CUDA_VERSION GREATER_EQUAL 11.6) op_library(fused_gemm_epilogue_op) diff --git a/paddle/fluid/operators/fused/resnet_basic_block_op.cc b/paddle/fluid/operators/fused/resnet_basic_block_op.cc deleted file mode 100644 index 37315367189fa..0000000000000 --- a/paddle/fluid/operators/fused/resnet_basic_block_op.cc +++ /dev/null @@ -1,566 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/common/ddim.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -class ResNetBasicBlockOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const { - // Check input - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ResNetBasicBlockOp"); - OP_INOUT_CHECK( - ctx->HasInput("Filter1"), "Input", "Filter1", "ResNetBasicBlockOp"); - OP_INOUT_CHECK( - ctx->HasInput("Scale1"), "Input", "Scale1", "ResNetBasicBlockOp"); - OP_INOUT_CHECK( - ctx->HasInput("Bias1"), "Input", "Bias1", "ResNetBasicBlockOp"); - OP_INOUT_CHECK( - ctx->HasInput("Mean1"), "Input", "Mean1", "ResNetBasicBlockOp"); - OP_INOUT_CHECK( - ctx->HasInput("Var1"), "Input", "Var1", "ResNetBasicBlockOp"); - OP_INOUT_CHECK( - ctx->HasInput("Filter2"), "Input", "Filter2", "ResNetBasicBlockOp"); - OP_INOUT_CHECK( - ctx->HasInput("Scale2"), "Input", "Scale2", "ResNetBasicBlockOp"); - OP_INOUT_CHECK( - ctx->HasInput("Bias2"), "Input", "Bias2", "ResNetBasicBlockOp"); - OP_INOUT_CHECK( - ctx->HasInput("Mean2"), "Input", "Mean2", "ResNetBasicBlockOp"); - OP_INOUT_CHECK( - ctx->HasInput("Var2"), "Input", "Var2", "ResNetBasicBlockOp"); - - bool has_shortcut = ctx->Attrs().Get("has_shortcut"); - if (has_shortcut) { - OP_INOUT_CHECK( - ctx->HasInput("Filter3"), "Input", "Filter3", "ResNetBasicBlockOp"); - OP_INOUT_CHECK( - ctx->HasInput("Scale3"), "Input", "Scale3", "ResNetBasicBlockOp"); - OP_INOUT_CHECK( - ctx->HasInput("Bias3"), "Input", "Bias3", "ResNetBasicBlockOp"); - OP_INOUT_CHECK( - ctx->HasInput("Mean3"), "Input", "Mean3", "ResNetBasicBlockOp"); - OP_INOUT_CHECK( - ctx->HasInput("Var3"), "Input", "Var3", "ResNetBasicBlockOp"); - } - - // Check output - OP_INOUT_CHECK(ctx->HasOutput("Y"), "Output", "Y", "ResNetBasicBlockOp"); - OP_INOUT_CHECK( - ctx->HasOutput("Conv1"), "Output", "Conv1", "ResNetBasicBlockOp"); - OP_INOUT_CHECK(ctx->HasOutput("SavedMean1"), - "Output", - "SavedMean1", - "ResNetBasicBlockOp"); - OP_INOUT_CHECK(ctx->HasOutput("SavedInvstd1"), - "Output", - "SavedInvstd1", - "ResNetBasicBlockOp"); - OP_INOUT_CHECK( - ctx->HasOutput("Mean1Out"), "Output", "Mean1Out", "ResNetBasicBlockOp"); - OP_INOUT_CHECK( - ctx->HasOutput("Var1Out"), "Output", "Var1Out", "ResNetBasicBlockOp"); - OP_INOUT_CHECK( - ctx->HasOutput("Conv2"), "Output", "Conv2", "ResNetBasicBlockOp"); - OP_INOUT_CHECK(ctx->HasOutput("SavedMean2"), - "Output", - "SavedMean2", - "ResNetBasicBlockOp"); - OP_INOUT_CHECK(ctx->HasOutput("SavedInvstd2"), - "Output", - "SavedInvstd2", - "ResNetBasicBlockOp"); - OP_INOUT_CHECK( - ctx->HasOutput("Mean2Out"), "Output", "Mean2Out", "ResNetBasicBlockOp"); - OP_INOUT_CHECK( - ctx->HasOutput("Var2Out"), "Output", "Var2Out", "ResNetBasicBlockOp"); - if (has_shortcut) { - OP_INOUT_CHECK( - ctx->HasOutput("Conv3"), "Output", "Conv3", "ResNetBasicBlockOp"); - OP_INOUT_CHECK(ctx->HasOutput("SavedMean3"), - "Output", - "SavedMean3", - "ResNetBasicBlockOp"); - OP_INOUT_CHECK(ctx->HasOutput("SavedInvstd3"), - "Output", - "SavedInvstd3", - "ResNetBasicBlockOp"); - OP_INOUT_CHECK(ctx->HasOutput("Mean3Out"), - "Output", - "Mean3Out", - "ResNetBasicBlockOp"); - OP_INOUT_CHECK( - ctx->HasOutput("Var3Out"), "Output", "Var3Out", "ResNetBasicBlockOp"); - } - - // make sure Mean/RunningMean and Var/RunningVar share memory - PADDLE_ENFORCE_EQ(ctx->Inputs("Mean1")[0], - ctx->Outputs("Mean1Out")[0], - phi::errors::InvalidArgument( - "Mean1 and Mean1Out should share 
the same memory")); - PADDLE_ENFORCE_EQ(ctx->Inputs("Var1")[0], - ctx->Outputs("Var1Out")[0], - phi::errors::InvalidArgument( - "Var1 and Var1Out should share the same memory")); - PADDLE_ENFORCE_EQ(ctx->Inputs("Mean2")[0], - ctx->Outputs("Mean2Out")[0], - phi::errors::InvalidArgument( - "Mean2 and Mean2Out should share the same memory")); - PADDLE_ENFORCE_EQ(ctx->Inputs("Var2")[0], - ctx->Outputs("Var2Out")[0], - phi::errors::InvalidArgument( - "Var2 and Var2Out should share the same memory")); - - if (has_shortcut) { - PADDLE_ENFORCE_EQ(ctx->Inputs("Mean3")[0], - ctx->Outputs("Mean3Out")[0], - phi::errors::InvalidArgument( - "Mean3 and Mean3Out should share the same memory")); - PADDLE_ENFORCE_EQ(ctx->Inputs("Var3")[0], - ctx->Outputs("Var3Out")[0], - phi::errors::InvalidArgument( - "Var3 and Var3Out should share the same memory")); - } - - // Check dims of inputs - auto data_format = ctx->Attrs().Get("data_format"); - PADDLE_ENFORCE_EQ( - data_format, - "NCHW", - phi::errors::InvalidArgument("The data format must equal to NCHW. " - "But received: the data format " - "= [%s]", - data_format)); - int stride1 = ctx->Attrs().Get("stride1"); - int stride2 = ctx->Attrs().Get("stride2"); - int padding1 = ctx->Attrs().Get("padding1"); - int padding2 = ctx->Attrs().Get("padding2"); - - const auto x1_dims = ctx->GetInputDim("X"); - const auto w1_dims = ctx->GetInputDim("Filter1"); - const auto bn1_param_dims = ctx->GetInputDim("Scale1"); - PADDLE_ENFORCE_EQ( - x1_dims.size(), - 4, - phi::errors::InvalidArgument("The dimensions of input " - "must equal to 4." - "But received: the shape of input " - "= [%s], the dimension of input = " - "[%d]", - x1_dims, - x1_dims.size())); - - // Calculate the dims of output1 - int batch = x1_dims[0]; - int output1_channel = w1_dims[0]; - int filter1_size = w1_dims[2]; - int out1_h = (x1_dims[2] + padding1 * 2 - filter1_size) / stride1 + 1; - int out1_w = (x1_dims[3] + padding1 * 2 - filter1_size) / stride1 + 1; - std::vector out1_shape = {batch, output1_channel, out1_h, out1_w}; - - const auto w2_dims = ctx->GetInputDim("Filter2"); - const auto bn2_param_dims = ctx->GetInputDim("Scale2"); - int output2_channel = w2_dims[0]; - int filter2_size = w2_dims[2]; - int out2_h = (out1_h + padding2 * 2 - filter2_size) / stride2 + 1; - int out2_w = (out1_w + padding2 * 2 - filter2_size) / stride2 + 1; - std::vector out2_shape = {batch, output2_channel, out2_h, out2_w}; - - auto y_dims = common::make_ddim(out2_shape); - auto conv1_dims = common::make_ddim(out1_shape); - ctx->SetOutputDim("Y", y_dims); - ctx->SetOutputDim("Conv1", conv1_dims); - ctx->SetOutputDim("SavedMean1", bn1_param_dims); - ctx->SetOutputDim("SavedInvstd1", bn1_param_dims); - ctx->SetOutputDim("Mean1Out", bn1_param_dims); - ctx->SetOutputDim("Var1Out", bn1_param_dims); - ctx->SetOutputDim("Conv2", y_dims); - ctx->SetOutputDim("Conv2Input", conv1_dims); - ctx->SetOutputDim("SavedMean2", bn2_param_dims); - ctx->SetOutputDim("SavedInvstd2", bn2_param_dims); - ctx->SetOutputDim("Mean2Out", bn2_param_dims); - ctx->SetOutputDim("Var2Out", bn2_param_dims); - if (has_shortcut) { - ctx->SetOutputDim("Conv3", y_dims); - ctx->SetOutputDim("SavedMean3", bn2_param_dims); - ctx->SetOutputDim("SavedInvstd3", bn2_param_dims); - ctx->SetOutputDim("Mean3Out", bn2_param_dims); - ctx->SetOutputDim("Var3Out", bn2_param_dims); - } - - bool find_max = ctx->Attrs().Get("find_conv_input_max"); - if (find_max) { - auto max_dims = common::make_ddim({6}); - ctx->SetOutputDim("MaxInput1", max_dims); - 
ctx->SetOutputDim("MaxFilter1", max_dims); - ctx->SetOutputDim("MaxInput2", max_dims); - ctx->SetOutputDim("MaxFilter2", max_dims); - if (has_shortcut) { - ctx->SetOutputDim("MaxInput3", max_dims); - ctx->SetOutputDim("MaxFilter3", max_dims); - } - } - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const { - auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); - - // By default, the type of the scale, bias, mean, - // and var tensors should be float when input tensor's dtype is float16. - auto bn_param_type = framework::proto::VarType::FP32; - PADDLE_ENFORCE_EQ( - bn_param_type, - framework::TransToProtoVarType( - ctx.Input("Scale1")->dtype()), - phi::errors::InvalidArgument("Scale input should be of float type")); - PADDLE_ENFORCE_EQ( - bn_param_type, - framework::TransToProtoVarType( - ctx.Input("Bias1")->dtype()), - phi::errors::InvalidArgument("Bias input should be of float type")); - PADDLE_ENFORCE_EQ( - bn_param_type, - framework::TransToProtoVarType( - ctx.Input("Scale2")->dtype()), - phi::errors::InvalidArgument("Scale input should be of float type")); - PADDLE_ENFORCE_EQ( - bn_param_type, - framework::TransToProtoVarType( - ctx.Input("Bias2")->dtype()), - phi::errors::InvalidArgument("Bias input should be of float type")); - - return phi::KernelKey(input_data_type, ctx.GetPlace()); - } -}; - -class ResNetBasicBlockOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() { - // has_shortcut = True: else: - // X X - // / / - // | | | | - // CONV1 | CONV1 | - // | | | | - // BN1 | BN1 | - // | | | | - // RELU1 | RELU1 | - // | | | | - // CONV2 CONV3 CONV2 | - // | | | | - // BN2 BN3 BN2 | - // \ / \ / - // ADD ADD - // | | - // RELU RELU - // | | - // Y Y - AddInput("X", "Input tensor of conv 1"); - AddInput("Filter1", "Filter tensor of conv 1"); - AddInput("Scale1", "Scale tensor of bn 1"); - AddInput("Bias1", "Bias tensor of bn 1"); - AddInput("Mean1", "Mean tensor of bn 1"); - AddInput("Var1", "Variance tensor of bn 1"); - AddInput("Filter2", "Filter tensor of conv 2"); - AddInput("Scale2", "Scale tensor of bn 2"); - AddInput("Bias2", "Bias tensor of bn 2"); - AddInput("Mean2", "Mean tensor of bn 2"); - AddInput("Var2", "Variance tensor of bn 2"); - AddInput("Filter3", "Filter tensor of conv 3").AsDispensable(); - AddInput("Scale3", "Scale tensor of bn 3").AsDispensable(); - AddInput("Bias3", "Bias tensor of bn 3").AsDispensable(); - AddInput("Mean3", "Mean tensor of bn 3").AsDispensable(); - AddInput("Var3", "Variance tensor of bn 3").AsDispensable(); - AddOutput("Y", "The result of ssd resnet unit"); - AddOutput("Conv1", "The result of conv 1"); - AddOutput("SavedMean1", "Mean of input 1 after conv 1"); - AddOutput("SavedInvstd1", "Invstd of input 1 after conv 1"); - AddOutput("Mean1Out", "Shared memory with Mean1"); - AddOutput("Var1Out", "Shared memory with Var1"); - AddOutput("Conv2", "The result of conv 2"); - AddOutput("Conv2Input", "Conv2 input data"); - AddOutput("SavedMean2", "Mean of input 2 after conv 2"); - AddOutput("SavedInvstd2", "Invstd of input 2 after conv 2"); - AddOutput("Mean2Out", "Shared memory with Mean2"); - AddOutput("Var2Out", "Shared memory with Var2"); - AddOutput("Conv3", "The result of conv 3").AsDispensable(); - AddOutput("SavedMean3", "Mean of input 3 after conv 3").AsDispensable(); - AddOutput("SavedInvstd3", "Invstd of input 3 after conv 3").AsDispensable(); - AddOutput("Mean3Out", "Shared memory with Mean3").AsDispensable(); - AddOutput("Var3Out", 
"Shared memory with Var3").AsDispensable(); - AddOutput("MaxInput1", "The max value of conv1 input tensor") - .AsDispensable(); - AddOutput("MaxFilter1", "The max value of conv1 filter tensor") - .AsDispensable(); - AddOutput("MaxInput2", "The max value of conv2 input tensor") - .AsDispensable(); - AddOutput("MaxFilter2", "The max value of conv2 filter tensor") - .AsDispensable(); - AddOutput("MaxInput3", "The max value of conv3 input tensor") - .AsDispensable(); - AddOutput("MaxFilter3", "The max value of conv3 filter tensor") - .AsDispensable(); - AddAttr("stride1", "Stride of conv1").SetDefault(1); - AddAttr("stride2", "Stride of conv2").SetDefault(1); - AddAttr("stride3", "Stride of conv3").SetDefault(1); - AddAttr("padding1", "Padding of conv1").SetDefault(0); - AddAttr("padding2", "Padding of conv2").SetDefault(0); - AddAttr("padding3", "Padding of conv3").SetDefault(0); - AddAttr("dilation1", "Dilation of conv1").SetDefault(1); - AddAttr("dilation2", "Dilation of conv2").SetDefault(1); - AddAttr("dilation3", "Dilation of conv3").SetDefault(1); - AddAttr("group", "Group of all the 3 conv").SetDefault(1); - AddAttr("momentum", "Momentum of all the 3 bn").SetDefault(0.9); - AddAttr("epsilon", "Epsilon of all the 3 bn").SetDefault(1e-5); - AddAttr("data_format", "").SetDefault("NCHW"); - AddAttr("has_shortcut", "").SetDefault(false); - AddAttr("use_global_stats", "").SetDefault(false); - AddAttr("is_test", - "(bool, default false) Set to true for inference only, false " - "for training. Some layers may run faster when this is true.") - .SetDefault(false); - AddAttr( - "trainable_statistics", - "(bool, default false) Whether to calculate mean and variance " - "in test mode. If setting true in test mode, mean and variace " - "will be calculated by current batch statistics.") - .SetDefault(false); - AddAttr("act_type", "The activation type to be fused.") - .SetDefault("relu"); - AddAttr("find_conv_input_max", - "(bool, default true) Whether to calculate max value of conv " - "input tensor.") - .SetDefault(true); - AddComment(R"DOC( -Fusion op of the basic unit of ssd resnet block. 
-** This is only used for XPU; if there are problems, contact zhangyikun02@baidu.com **
-)DOC");
-  }
-};
-
-template <typename T>
-class ResNetBasicBlockGradOpMaker : public framework::SingleGradOpMaker<T> {
- public:
-  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
-
- protected:
-  void Apply(GradOpPtr<T> op) const override {
-    op->SetType("resnet_basic_block_grad");
-    op->SetInput("X", this->Input("X"));
-    op->SetInput("Filter1", this->Input("Filter1"));
-    op->SetInput("Conv1", this->Output("Conv1"));
-    op->SetInput("Scale1", this->Input("Scale1"));
-    op->SetInput("Bias1", this->Input("Bias1"));
-    op->SetInput("SavedMean1", this->Output("SavedMean1"));
-    op->SetInput("SavedInvstd1", this->Output("SavedInvstd1"));
-    op->SetInput("Filter2", this->Input("Filter2"));
-    op->SetInput("Conv2", this->Output("Conv2"));
-    op->SetInput("Conv2Input", this->Output("Conv2Input"));
-    op->SetInput("Scale2", this->Input("Scale2"));
-    op->SetInput("Bias2", this->Input("Bias2"));
-    op->SetInput("SavedMean2", this->Output("SavedMean2"));
-    op->SetInput("SavedInvstd2", this->Output("SavedInvstd2"));
-    op->SetInput("Filter3", this->Input("Filter3"));
-    op->SetInput("Conv3", this->Output("Conv3"));
-    op->SetInput("Scale3", this->Input("Scale3"));
-    op->SetInput("Bias3", this->Input("Bias3"));
-    op->SetInput("SavedMean3", this->Output("SavedMean3"));
-    op->SetInput("SavedInvstd3", this->Output("SavedInvstd3"));
-    op->SetInput("MaxInput1", this->Output("MaxInput1"));
-    op->SetInput("MaxFilter1", this->Output("MaxFilter1"));
-    op->SetInput("MaxInput2", this->Output("MaxInput2"));
-    op->SetInput("MaxFilter2", this->Output("MaxFilter2"));
-    op->SetInput("MaxInput3", this->Output("MaxInput3"));
-    op->SetInput("MaxFilter3", this->Output("MaxFilter3"));
-    op->SetInput("Y", this->Output("Y"));
-    op->SetInput(framework::GradVarName("Y"), this->OutputGrad("Y"));
-
-    op->SetAttrMap(this->Attrs());
-
-    op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
-    op->SetOutput(framework::GradVarName("Filter1"),
-                  this->InputGrad("Filter1"));
-    op->SetOutput(framework::GradVarName("Scale1"), this->InputGrad("Scale1"));
-    op->SetOutput(framework::GradVarName("Bias1"), this->InputGrad("Bias1"));
-    op->SetOutput(framework::GradVarName("Filter2"),
-                  this->InputGrad("Filter2"));
-    op->SetOutput(framework::GradVarName("Scale2"), this->InputGrad("Scale2"));
-    op->SetOutput(framework::GradVarName("Bias2"), this->InputGrad("Bias2"));
-    op->SetOutput(framework::GradVarName("Filter3"),
-                  this->InputGrad("Filter3"));
-    op->SetOutput(framework::GradVarName("Scale3"), this->InputGrad("Scale3"));
-    op->SetOutput(framework::GradVarName("Bias3"), this->InputGrad("Bias3"));
-  }
-};
-
-class ResNetBasicBlockOpInferVarType
-    : public framework::PassInDtypeAndVarTypeToOutput {
- protected:
-  std::unordered_map<std::string, std::string>& GetInputOutputWithSameType()
-      const override {
-    static std::unordered_map<std::string, std::string> m{{"X", /*->*/ "Y"}};
-    return m;
-  }
-};
-
-class ResNetBasicBlockGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const {
-    // check input
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ResNetBasicBlockGradOp");
-    OP_INOUT_CHECK(
-        ctx->HasInput("Filter1"), "Input", "Filter1", "ResNetBasicBlockGradOp");
-    OP_INOUT_CHECK(
-        ctx->HasInput("Conv1"), "Input", "Conv1", "ResNetBasicBlockGradOp");
-    OP_INOUT_CHECK(
-        ctx->HasInput("Scale1"), "Input", "Scale1", "ResNetBasicBlockGradOp");
-    OP_INOUT_CHECK(
-        ctx->HasInput("Bias1"), "Input", "Bias1",
"ResNetBasicBlockGradOp"); - OP_INOUT_CHECK(ctx->HasInput("SavedMean1"), - "Input", - "SavedMean1", - "ResNetBasicBlockGradOp"); - OP_INOUT_CHECK(ctx->HasInput("SavedInvstd1"), - "Input", - "SavedInvstd1", - "ResNetBasicBlockGradOp"); - OP_INOUT_CHECK( - ctx->HasInput("Filter2"), "Input", "Filter2", "ResNetBasicBlockGradOp"); - OP_INOUT_CHECK( - ctx->HasInput("Conv2"), "Input", "Conv2", "ResNetBasicBlockGradOp"); - OP_INOUT_CHECK( - ctx->HasInput("Scale2"), "Input", "Scale2", "ResNetBasicBlockGradOp"); - OP_INOUT_CHECK( - ctx->HasInput("Bias2"), "Input", "Bias2", "ResNetBasicBlockGradOp"); - OP_INOUT_CHECK(ctx->HasInput("SavedMean2"), - "Input", - "SavedMean2", - "ResNetBasicBlockGradOp"); - OP_INOUT_CHECK(ctx->HasInput("SavedInvstd2"), - "Input", - "SavedInvstd2", - "ResNetBasicBlockGradOp"); - bool has_shortcut = ctx->Attrs().Get("has_shortcut"); - if (has_shortcut) { - OP_INOUT_CHECK(ctx->HasInput("Filter3"), - "Input", - "Filter3", - "ResNetBasicBlockGradOp"); - OP_INOUT_CHECK( - ctx->HasInput("Scale3"), "Input", "Scale3", "ResNetBasicBlockGradOp"); - OP_INOUT_CHECK( - ctx->HasInput("Bias3"), "Input", "Bias3", "ResNetBasicBlockGradOp"); - } - OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "ResNetBasicBlockGradOp"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Y")), - "Input", - framework::GradVarName("Y"), - "ResNetBasicBlockGradOp"); - - // check output - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Filter1")), - "Output", - framework::GradVarName("Filter1"), - "ResNetBasicBlockGradOp"); - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Scale1")), - "Output", - framework::GradVarName("Scale1"), - "ResNetBasicBlockGradOp"); - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Bias1")), - "Output", - framework::GradVarName("Bias1"), - "ResNetBasicBlockGradOp"); - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Filter2")), - "Output", - framework::GradVarName("Filter2"), - "ResNetBasicBlockGradOp"); - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Scale2")), - "Output", - framework::GradVarName("Scale2"), - "ResNetBasicBlockGradOp"); - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Bias2")), - "Output", - framework::GradVarName("Bias2"), - "ResNetBasicBlockGradOp"); - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), - "Output", - framework::GradVarName("X"), - "ResNetBasicBlockGradOp"); - if (has_shortcut) { - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Filter3")), - "Output", - framework::GradVarName("Filter3"), - "ResNetBasicBlockGradOp"); - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Scale3")), - "Output", - framework::GradVarName("Scale3"), - "ResNetBasicBlockGradOp"); - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Bias3")), - "Output", - framework::GradVarName("Bias3"), - "ResNetBasicBlockGradOp"); - } - - const auto x1_dims = ctx->GetInputDim("X"); - const auto filter1_x_dims = ctx->GetInputDim("Filter1"); - const auto param1_dims = ctx->GetInputDim("Scale1"); - const auto filter2_x_dims = ctx->GetInputDim("Filter2"); - const auto param2_dims = ctx->GetInputDim("Scale2"); - ctx->SetOutputDim(framework::GradVarName("X"), x1_dims); - ctx->SetOutputDim(framework::GradVarName("Filter1"), filter1_x_dims); - ctx->SetOutputDim(framework::GradVarName("Scale1"), param1_dims); - ctx->SetOutputDim(framework::GradVarName("Bias1"), param1_dims); - ctx->SetOutputDim(framework::GradVarName("Filter2"), filter2_x_dims); - ctx->SetOutputDim(framework::GradVarName("Scale2"), param2_dims); - 
ctx->SetOutputDim(framework::GradVarName("Bias2"), param2_dims); - if (has_shortcut) { - const auto filter_z_dims = ctx->GetInputDim("Filter3"); - ctx->SetOutputDim(framework::GradVarName("Filter3"), filter_z_dims); - ctx->SetOutputDim(framework::GradVarName("Scale3"), param2_dims); - ctx->SetOutputDim(framework::GradVarName("Bias3"), param2_dims); - } - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const { - PADDLE_ENFORCE_NOT_NULL( - ctx.InputVar(framework::GradVarName("Y")), - phi::errors::NotFound("Can not find Y@GRAD in the execution context.")); - - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.GetPlace()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR(resnet_basic_block, - ops::ResNetBasicBlockOp, - ops::ResNetBasicBlockOpMaker, - ops::ResNetBasicBlockOpInferVarType, - ops::ResNetBasicBlockGradOpMaker, - ops::ResNetBasicBlockGradOpMaker); -REGISTER_OPERATOR(resnet_basic_block_grad, ops::ResNetBasicBlockGradOp); diff --git a/paddle/fluid/operators/fused/resnet_basic_block_op_xpu.cc b/paddle/fluid/operators/fused/resnet_basic_block_op_xpu.cc deleted file mode 100644 index 50a3b3c46137d..0000000000000 --- a/paddle/fluid/operators/fused/resnet_basic_block_op_xpu.cc +++ /dev/null @@ -1,1007 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifdef PADDLE_WITH_XPU -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device/device_wrapper.h" -#include "paddle/fluid/platform/device/xpu/xpu_header.h" - -namespace paddle { -namespace operators { - -class ResnetBasicBlockAttr { - public: - explicit ResnetBasicBlockAttr(const framework::ExecutionContext& ctx) { - padding1 = ctx.Attr("padding1"); - padding2 = ctx.Attr("padding2"); - padding3 = ctx.Attr("padding3"); - stride1 = ctx.Attr("stride1"); - stride2 = ctx.Attr("stride2"); - stride3 = ctx.Attr("stride3"); - dilation1 = ctx.Attr("dilation1"); - dilation2 = ctx.Attr("dilation2"); - dilation3 = ctx.Attr("dilation3"); - group = ctx.Attr("group"); - - eps = static_cast(ctx.Attr("epsilon")); - momentum = static_cast(ctx.Attr("momentum")); - has_shortcut = ctx.Attr("has_shortcut"); - find_max = ctx.Attr("find_conv_input_max"); - - const auto is_test = ctx.Attr("is_test"); - const auto use_global_stats = ctx.Attr("use_global_stats"); - const auto trainable_stats = ctx.Attr("trainable_statistics"); - bool test_mode = is_test && (!trainable_stats); - global_stats = test_mode || use_global_stats; - - // init shape - auto input1 = ctx.Input("X"); - auto filter1 = ctx.Input("Filter1"); - auto conv1_out = ctx.Output("Conv1"); - auto filter2 = ctx.Input("Filter2"); - auto conv2_out = ctx.Output("Conv2"); - conv1_input_shape = common::vectorize(input1->dims()); - conv1_output_shape = common::vectorize(conv1_out->dims()); - conv1_filter_shape = common::vectorize(filter1->dims()); - conv1_filter_numel = filter1->numel(); - conv1_input_numel = input1->numel(); - conv1_output_numel = conv1_out->numel(); - - conv2_input_shape = common::vectorize(conv1_out->dims()); - conv2_output_shape = common::vectorize(conv2_out->dims()); - conv2_filter_shape = common::vectorize(filter2->dims()); - conv2_filter_numel = filter2->numel(); - conv2_input_numel = conv1_out->numel(); - conv2_output_numel = conv2_out->numel(); - - if (has_shortcut) { - auto filter3 = ctx.Input("Filter3"); - auto conv3_out = ctx.Output("Conv3"); - conv3_input_shape = common::vectorize(input1->dims()); - conv3_output_shape = common::vectorize(conv3_out->dims()); - conv3_filter_shape = common::vectorize(filter3->dims()); - conv3_filter_numel = filter3->numel(); - conv3_input_numel = input1->numel(); - conv3_output_numel = conv3_out->numel(); - } - } - - int padding1; - int padding2; - int padding3; - int stride1; - int stride2; - int stride3; - int dilation1; - int dilation2; - int dilation3; - int group; - - double eps; - double momentum; - - bool has_shortcut; - bool find_max; - bool global_stats; - - std::vector conv1_input_shape; - std::vector conv1_output_shape; - std::vector conv1_filter_shape; - std::vector conv2_input_shape; - std::vector conv2_output_shape; - std::vector conv2_filter_shape; - std::vector conv3_input_shape; - std::vector conv3_output_shape; - std::vector conv3_filter_shape; - - int conv1_filter_numel; - int conv2_filter_numel; - int conv3_filter_numel; - int conv1_input_numel; - int conv2_input_numel; - int conv3_input_numel; - int conv1_output_numel; - int conv2_output_numel; - int conv3_output_numel; -}; - -class ResnetBasicBlockGradAttr { - public: - explicit ResnetBasicBlockGradAttr(const framework::ExecutionContext& ctx) { - padding1 = ctx.Attr("padding1"); - padding2 = ctx.Attr("padding2"); - padding3 = ctx.Attr("padding3"); - stride1 = ctx.Attr("stride1"); - stride2 = ctx.Attr("stride2"); - stride3 = ctx.Attr("stride3"); - dilation1 = ctx.Attr("dilation1"); - dilation2 = 
ctx.Attr("dilation2"); - dilation3 = ctx.Attr("dilation3"); - group = ctx.Attr("group"); - - has_shortcut = ctx.Attr("has_shortcut"); - find_max = ctx.Attr("find_conv_input_max"); - - // init shape - auto input1 = ctx.Input("X"); - auto filter1 = ctx.Input("Filter1"); - auto conv1_out = ctx.Input("Conv1"); - auto filter2 = ctx.Input("Filter2"); - auto conv2_out = ctx.Input("Conv2"); - conv1_input_shape = common::vectorize(input1->dims()); - conv1_output_shape = common::vectorize(conv1_out->dims()); - conv1_filter_shape = common::vectorize(filter1->dims()); - conv1_filter_numel = filter1->numel(); - conv1_input_numel = input1->numel(); - conv1_output_numel = conv1_out->numel(); - - conv2_input_shape = common::vectorize(conv1_out->dims()); - conv2_output_shape = common::vectorize(conv2_out->dims()); - conv2_filter_shape = common::vectorize(filter2->dims()); - conv2_filter_numel = filter2->numel(); - conv2_input_numel = conv1_out->numel(); - conv2_output_numel = conv2_out->numel(); - - if (has_shortcut) { - auto filter3 = ctx.Input("Filter3"); - auto conv3_out = ctx.Input("Conv3"); - conv3_input_shape = common::vectorize(input1->dims()); - conv3_output_shape = common::vectorize(conv3_out->dims()); - conv3_filter_shape = common::vectorize(filter3->dims()); - conv3_filter_numel = filter3->numel(); - conv3_input_numel = input1->numel(); - conv3_output_numel = conv3_out->numel(); - } - } - - int padding1; - int padding2; - int padding3; - int stride1; - int stride2; - int stride3; - int dilation1; - int dilation2; - int dilation3; - int group; - - bool has_shortcut; - bool find_max; - - std::vector conv1_input_shape; - std::vector conv1_output_shape; - std::vector conv1_filter_shape; - std::vector conv2_input_shape; - std::vector conv2_output_shape; - std::vector conv2_filter_shape; - std::vector conv3_input_shape; - std::vector conv3_output_shape; - std::vector conv3_filter_shape; - - int conv1_filter_numel; - int conv2_filter_numel; - int conv3_filter_numel; - int conv1_input_numel; - int conv2_input_numel; - int conv3_input_numel; - int conv1_output_numel; - int conv2_output_numel; - int conv3_output_numel; -}; - -template -static inline void xpu_conv2d(xpu::Context* ctx, - const T* input_data, - const T* filter_data, - T* output_data, - float* input_max_data, - float* filter_max_data, - const std::vector& input_shape, - const std::vector& filter_shape, - int padding, - int stride, - int dilation, - int group) { - std::vector ksize{filter_shape[2], filter_shape[3]}; - std::vector stride_vec{stride, stride}; - std::vector dilation_vec{dilation, dilation}; - std::vector padding_vec{padding, padding}; - int N = input_shape[0]; - int C = input_shape[1]; - int H = input_shape[2]; - int W = input_shape[3]; - - int r = xpu::conv2d(ctx, - input_data, - filter_data, - output_data, - N, - C, - H, - W, - filter_shape[0], - ksize, - stride_vec, - padding_vec, - dilation_vec, - group, - input_max_data, - filter_max_data, - nullptr, - true); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "conv2d"); -} - -template -static inline void xpu_conv2d_grad(xpu::Context* ctx, - const T* input_data, - const T* filter_data, - const T* output_grad_data, - T* input_grad_data, - T* filter_grad_data, - const float* input_max_data, - const float* filter_max_data, - const std::vector& input_shape, - const std::vector& filter_shape, - int padding, - int stride, - int dilation, - int group) { - std::vector ksize{filter_shape[2], filter_shape[3]}; - std::vector stride_vec{stride, stride}; - std::vector dilation_vec{dilation, dilation}; - 
std::vector padding_vec{padding, padding}; - int N = input_shape[0]; - int C = input_shape[1]; - int H = input_shape[2]; - int W = input_shape[3]; - - int r = xpu::conv2d_grad(ctx, - input_data, - filter_data, - output_grad_data, - input_grad_data, - filter_grad_data, - N, - C, - H, - W, - filter_shape[0], - ksize, - stride_vec, - padding_vec, - dilation_vec, - group, - input_max_data, - filter_max_data, - nullptr, - nullptr, - nullptr, - true); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "conv2d_grad"); -} - -template -class ResNetBasicBlockXPUKernel : public framework::OpKernel { - public: - using XPUType = typename XPUTypeTrait::Type; - - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_xpu_place(ctx.GetPlace()), - true, - phi::errors::PreconditionNotMet("It must use XPUPlace.")); - - // input - const phi::DenseTensor* x = ctx.Input("X"); - const phi::DenseTensor* filter1 = ctx.Input("Filter1"); - const phi::DenseTensor* scale1 = ctx.Input("Scale1"); - const phi::DenseTensor* bias1 = ctx.Input("Bias1"); - const phi::DenseTensor* filter2 = ctx.Input("Filter2"); - const phi::DenseTensor* scale2 = ctx.Input("Scale2"); - const phi::DenseTensor* bias2 = ctx.Input("Bias2"); - - // output - phi::DenseTensor* conv1_output = ctx.Output("Conv1"); - phi::DenseTensor* conv2_output = ctx.Output("Conv2"); - phi::DenseTensor* conv2_input = ctx.Output("Conv2Input"); - phi::DenseTensor* output = ctx.Output("Y"); - - auto place = ctx.GetPlace(); - auto x_data = reinterpret_cast(x->data()); - auto conv1_filter_data = - reinterpret_cast(filter1->data()); - auto conv2_filter_data = - reinterpret_cast(filter2->data()); - auto conv1_output_data = - reinterpret_cast(conv1_output->mutable_data(place)); - auto conv2_input_data = - reinterpret_cast(conv2_input->mutable_data(place)); - auto conv2_output_data = - reinterpret_cast(conv2_output->mutable_data(place)); - auto scale1_data = scale1->data(); - auto scale2_data = scale2->data(); - auto bias1_data = bias1->data(); - auto bias2_data = bias2->data(); - auto output_data = - reinterpret_cast(output->mutable_data(place)); - - float* conv1_input_max_data = nullptr; - float* conv1_filter_max_data = nullptr; - float* conv2_input_max_data = nullptr; - float* conv2_filter_max_data = nullptr; - float* conv3_input_max_data = nullptr; - float* conv3_filter_max_data = nullptr; - - ResnetBasicBlockAttr attr(ctx); - - // init find max - if (attr.find_max) { - phi::DenseTensor* max_input1 = ctx.Output("MaxInput1"); - phi::DenseTensor* max_filter1 = - ctx.Output("MaxFilter1"); - conv1_input_max_data = max_input1->mutable_data(place); - conv1_filter_max_data = max_filter1->mutable_data(place); - - phi::DenseTensor* max_input2 = ctx.Output("MaxInput2"); - phi::DenseTensor* max_filter2 = - ctx.Output("MaxFilter2"); - conv2_input_max_data = max_input2->mutable_data(place); - conv2_filter_max_data = max_filter2->mutable_data(place); - - if (attr.has_shortcut) { - phi::DenseTensor* max_input3 = - ctx.Output("MaxInput3"); - phi::DenseTensor* max_filter3 = - ctx.Output("MaxFilter3"); - conv3_input_max_data = max_input3->mutable_data(place); - conv3_filter_max_data = max_filter3->mutable_data(place); - } - } - - auto& dev_ctx = ctx.template device_context(); - xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); - int r = XPU_SUCCESS; - - // 1. 
short - const XPUType* z_out_data = nullptr; - if (attr.has_shortcut) { - phi::DenseTensor* conv3_out = ctx.Output("Conv3"); - const phi::DenseTensor* filter3 = ctx.Input("Filter3"); - auto conv3_filter_data = - reinterpret_cast(filter3->data()); - auto conv3_output_data = - reinterpret_cast(conv3_out->mutable_data(place)); - - XPUType* conv3_input_l3_data = nullptr; - XPUType* conv3_filter_l3_data = - RAII_GUARD.alloc_l3_or_gm(attr.conv3_filter_numel); - - if (attr.find_max) { - r = xpu::findmax_copy_fusion(dev_ctx.x_context(), - x_data, - conv3_input_max_data, - conv3_input_l3_data, - attr.conv3_input_numel); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "findmax_copy_fusion"); - - r = xpu::findmax_copy_fusion(dev_ctx.x_context(), - conv3_filter_data, - conv3_filter_max_data, - conv3_filter_l3_data, - attr.conv3_filter_numel); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "findmax_copy_fusion"); - } - - xpu_conv2d(dev_ctx.x_context(), - conv3_input_l3_data != nullptr ? conv3_input_l3_data : x_data, - conv3_filter_l3_data, - conv3_output_data, - conv3_input_max_data, - conv3_filter_max_data, - attr.conv3_input_shape, - attr.conv3_filter_shape, - attr.padding3, - attr.stride3, - attr.dilation3, - attr.group); - - // bn3 - const phi::DenseTensor* scale3 = ctx.Input("Scale3"); - const phi::DenseTensor* bias3 = ctx.Input("Bias3"); - auto bias3_data = bias3->data(); - auto scale3_data = scale3->data(); - - auto bn3_output_data = RAII_GUARD.alloc(attr.conv3_output_numel); - PADDLE_ENFORCE_XDNN_NOT_NULL(bn3_output_data); - - if (!attr.global_stats) { - phi::DenseTensor* saved_mean3 = - ctx.Output("SavedMean3"); - phi::DenseTensor* saved_invstd3 = - ctx.Output("SavedInvstd3"); - phi::DenseTensor* running_mean3 = - ctx.Output("Mean3Out"); - phi::DenseTensor* running_var3 = - ctx.Output("Var3Out"); - - auto saved_mean3_data = saved_mean3->mutable_data(place); - auto saved_invstd3_data = saved_invstd3->mutable_data(place); - auto running_mean3_data = running_mean3->mutable_data(place); - auto running_var3_data = running_var3->mutable_data(place); - - r = xpu::batch_norm_fusion(dev_ctx.x_context(), - conv3_output_data, - bn3_output_data, - attr.conv3_output_shape[0], - attr.conv3_output_shape[1], - attr.conv3_output_shape[3], - attr.conv3_output_shape[3], - attr.eps, - attr.momentum, - scale3_data, - bias3_data, - saved_mean3_data, - saved_invstd3_data, - running_mean3_data, - running_var3_data, - true, - nullptr, - xpu::Activation_t::LINEAR, - nullptr, - 0); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "batch_norm_fusion"); - } else { - const auto* mean3 = ctx.Input("Mean3"); - const auto* var3 = ctx.Input("Var3"); - const auto* mean3_data = mean3->data(); - const auto* variance3_data = var3->data(); - r = xpu::batch_norm_infer(dev_ctx.x_context(), - conv3_output_data, - bn3_output_data, - attr.conv3_output_shape[0], - attr.conv3_output_shape[1], - attr.conv3_output_shape[2], - attr.conv3_output_shape[3], - attr.eps, - scale3_data, - bias3_data, - mean3_data, - variance3_data, - true); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "batch_norm_infer"); - } - z_out_data = reinterpret_cast(bn3_output_data); - } else { - z_out_data = x_data; - } - - // 2. 
conv1 - XPUType* conv1_input_l3_data = nullptr; - XPUType* conv1_filter_l3_data = - RAII_GUARD.alloc_l3_or_gm(attr.conv1_filter_numel); - if (attr.find_max) { - r = xpu::findmax_copy_fusion(dev_ctx.x_context(), - x_data, - conv1_input_max_data, - conv1_input_l3_data, - attr.conv1_input_numel); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "findmax_copy_fusion"); - - r = xpu::findmax_copy_fusion(dev_ctx.x_context(), - conv1_filter_data, - conv1_filter_max_data, - conv1_filter_l3_data, - attr.conv1_filter_numel); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "findmax_copy_fusion"); - } - xpu_conv2d(dev_ctx.x_context(), - conv1_input_l3_data != nullptr ? conv1_input_l3_data : x_data, - conv1_filter_l3_data, - conv1_output_data, - conv1_input_max_data, - conv1_filter_max_data, - attr.conv1_input_shape, - attr.conv1_filter_shape, - attr.padding1, - attr.stride1, - attr.dilation1, - attr.group); - - // 3. bn1 + relu - if (!attr.global_stats) { - phi::DenseTensor* saved_mean1 = - ctx.Output("SavedMean1"); - phi::DenseTensor* saved_invstd1 = - ctx.Output("SavedInvstd1"); - phi::DenseTensor* running_mean1 = - ctx.Output("Mean1Out"); - phi::DenseTensor* running_var1 = ctx.Output("Var1Out"); - - auto saved_mean1_data = saved_mean1->mutable_data(place); - auto saved_invstd1_data = saved_invstd1->mutable_data(place); - auto running_mean1_data = running_mean1->mutable_data(place); - auto running_var1_data = running_var1->mutable_data(place); - - r = xpu::batch_norm_fusion(dev_ctx.x_context(), - conv1_output_data, - conv2_input_data, - attr.conv1_output_shape[0], - attr.conv1_output_shape[1], - attr.conv1_output_shape[2], - attr.conv1_output_shape[3], - attr.eps, - attr.momentum, - scale1_data, - bias1_data, - saved_mean1_data, - saved_invstd1_data, - running_mean1_data, - running_var1_data, - true, - nullptr, - xpu::Activation_t::RELU, - nullptr, - 0); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "batch_norm_fusion"); - } else { - // bn --> relu - auto bn1_output_data = RAII_GUARD.alloc(attr.conv1_output_numel); - PADDLE_ENFORCE_XDNN_NOT_NULL(bn1_output_data); - - const auto* mean1 = ctx.Input("Mean1"); - const auto* var1 = ctx.Input("Var1"); - const auto* mean_data = mean1->data(); - const auto* variance_data = var1->data(); - r = xpu::batch_norm_infer(dev_ctx.x_context(), - conv1_output_data, - bn1_output_data, - attr.conv1_output_shape[0], - attr.conv1_output_shape[1], - attr.conv1_output_shape[2], - attr.conv1_output_shape[3], - attr.eps, - scale1_data, - bias1_data, - mean_data, - variance_data, - true); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "batch_norm_infer"); - - r = xpu::relu(dev_ctx.x_context(), - bn1_output_data, - conv2_input_data, - attr.conv1_output_numel); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "relu"); - } - - // 4. 
conv2 - XPUType* conv2_input_l3_data = nullptr; - XPUType* conv2_filter_l3_data = - RAII_GUARD.alloc_l3_or_gm(attr.conv2_filter_numel); - if (attr.find_max) { - phi::DenseTensor* max_input2 = ctx.Output("MaxInput2"); - phi::DenseTensor* max_filter2 = - ctx.Output("MaxFilter2"); - conv2_input_max_data = max_input2->mutable_data(place); - conv2_filter_max_data = max_filter2->mutable_data(place); - - r = xpu::findmax_copy_fusion(dev_ctx.x_context(), - conv2_input_data, - conv2_input_max_data, - conv2_input_l3_data, - attr.conv2_input_numel); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "findmax_copy_fusion"); - - r = xpu::findmax_copy_fusion(dev_ctx.x_context(), - conv2_filter_data, - conv2_filter_max_data, - conv2_filter_l3_data, - attr.conv2_filter_numel); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "findmax_copy_fusion"); - } - xpu_conv2d( - dev_ctx.x_context(), - conv2_input_l3_data != nullptr ? conv2_input_l3_data : conv2_input_data, - conv2_filter_l3_data, - conv2_output_data, - conv2_input_max_data, - conv2_filter_max_data, - attr.conv2_input_shape, - attr.conv2_filter_shape, - attr.padding2, - attr.stride2, - attr.dilation2, - attr.group); - - // 5. bn2 - if (!attr.global_stats) { - phi::DenseTensor* saved_mean2 = - ctx.Output("SavedMean2"); - phi::DenseTensor* saved_var2 = - ctx.Output("SavedInvstd2"); - phi::DenseTensor* running_mean2 = - ctx.Output("Mean2Out"); - phi::DenseTensor* running_var2 = ctx.Output("Var2Out"); - - auto saved_mean2_data = saved_mean2->mutable_data(place); - auto saved_var2_data = saved_var2->mutable_data(place); - auto running_mean2_data = running_mean2->mutable_data(place); - auto running_var2_data = running_var2->mutable_data(place); - - r = xpu::batch_norm_fusion(dev_ctx.x_context(), - conv2_output_data, - output_data, - attr.conv2_output_shape[0], - attr.conv2_output_shape[1], - attr.conv2_output_shape[2], - attr.conv2_output_shape[3], - attr.eps, - attr.momentum, - scale2_data, - bias2_data, - saved_mean2_data, - saved_var2_data, - running_mean2_data, - running_var2_data, - true, - z_out_data, - xpu::Activation_t::RELU, - nullptr, - 0); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "batch_norm_fusion"); - } else { - auto bn2_out_data = RAII_GUARD.alloc(attr.conv2_output_numel); - PADDLE_ENFORCE_XDNN_NOT_NULL(bn2_out_data); - - const auto* mean2 = ctx.Input("Mean2"); - const auto* var2 = ctx.Input("Var2"); - const auto* mean_data = mean2->data(); - const auto* variance_data = var2->data(); - r = xpu::batch_norm_infer(dev_ctx.x_context(), - conv2_output_data, - bn2_out_data, - attr.conv2_output_shape[0], - attr.conv2_output_shape[1], - attr.conv2_output_shape[2], - attr.conv2_output_shape[3], - attr.eps, - scale2_data, - bias2_data, - mean_data, - variance_data, - true); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "batch_norm_infer"); - - r = xpu::add_activation_fusion(dev_ctx.x_context(), - bn2_out_data, - z_out_data, - output_data, - output->numel(), - nullptr, - nullptr, - nullptr, - xpu::Activation_t::RELU); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "add_activation_fusion"); - } - } -}; - -template -class ResNetBasicBlockGradXPUKernel : public framework::OpKernel { - public: - using XPUType = typename XPUTypeTrait::Type; - - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_xpu_place(ctx.GetPlace()), - true, - phi::errors::PreconditionNotMet("It must use XPUPlace.")); - - const phi::DenseTensor* y_grad = - ctx.Input(framework::GradVarName("Y")); - const phi::DenseTensor* y = ctx.Input("Y"); - - const phi::DenseTensor* x = ctx.Input("X"); - const 
phi::DenseTensor* filter1 = ctx.Input("Filter1"); - const phi::DenseTensor* scale1 = ctx.Input("Scale1"); - const phi::DenseTensor* filter2 = ctx.Input("Filter2"); - const phi::DenseTensor* scale2 = ctx.Input("Scale2"); - const phi::DenseTensor* saved_mean1 = - ctx.Input("SavedMean1"); - const phi::DenseTensor* saved_invstd1 = - ctx.Input("SavedInvstd1"); - const phi::DenseTensor* saved_mean2 = - ctx.Input("SavedMean2"); - const phi::DenseTensor* saved_invstd2 = - ctx.Input("SavedInvstd2"); - const phi::DenseTensor* conv1_out = ctx.Input("Conv1"); - const phi::DenseTensor* conv2_out = ctx.Input("Conv2"); - const phi::DenseTensor* conv2_input = - ctx.Input("Conv2Input"); - - const phi::DenseTensor* filter3 = ctx.Input("Filter3"); - const phi::DenseTensor* conv3_out = ctx.Input("Conv3"); - const phi::DenseTensor* scale3 = ctx.Input("Scale3"); - const phi::DenseTensor* saved_mean3 = - ctx.Input("SavedMean3"); - const phi::DenseTensor* saved_invstd3 = - ctx.Input("SavedInvstd3"); - - const phi::DenseTensor* conv1_input_max = - ctx.Input("MaxInput1"); - const phi::DenseTensor* conv1_filter_max = - ctx.Input("MaxFilter1"); - const phi::DenseTensor* conv2_input_max = - ctx.Input("MaxInput2"); - const phi::DenseTensor* conv2_filter_max = - ctx.Input("MaxFilter2"); - const phi::DenseTensor* conv3_input_max = - ctx.Input("MaxInput3"); - const phi::DenseTensor* conv3_filter_max = - ctx.Input("MaxFilter3"); - - phi::DenseTensor* x_grad = - ctx.Output(framework::GradVarName("X")); - phi::DenseTensor* filter1_grad = - ctx.Output(framework::GradVarName("Filter1")); - phi::DenseTensor* scale1_grad = - ctx.Output(framework::GradVarName("Scale1")); - phi::DenseTensor* bias1_grad = - ctx.Output(framework::GradVarName("Bias1")); - phi::DenseTensor* filter2_grad = - ctx.Output(framework::GradVarName("Filter2")); - phi::DenseTensor* scale2_grad = - ctx.Output(framework::GradVarName("Scale2")); - phi::DenseTensor* bias2_grad = - ctx.Output(framework::GradVarName("Bias2")); - phi::DenseTensor* filter3_grad = - ctx.Output(framework::GradVarName("Filter3")); - phi::DenseTensor* scale3_grad = - ctx.Output(framework::GradVarName("Scale3")); - phi::DenseTensor* bias3_grad = - ctx.Output(framework::GradVarName("Bias3")); - - // attrs - ResnetBasicBlockGradAttr attr(ctx); - auto place = ctx.GetPlace(); - - const auto* y_grad_data = - reinterpret_cast(y_grad->data()); - const auto* y_data = reinterpret_cast(y->data()); - const auto* x_data = reinterpret_cast(x->data()); - const auto* conv1_output_data = - reinterpret_cast(conv1_out->data()); - const auto* conv1_filter_data = - reinterpret_cast(filter1->data()); - const auto* conv2_input_data = - reinterpret_cast(conv2_input->data()); - const auto* conv2_output_data = - reinterpret_cast(conv2_out->data()); - const auto* conv2_filter_data = - reinterpret_cast(filter2->data()); - - const auto* scale2_data = scale2->data(); - const auto* saved_mean2_data = saved_mean2->data(); - const auto* saved_invstd2_data = saved_invstd2->data(); - const auto* scale1_data = scale1->data(); - const auto* saved_mean1_data = saved_mean1->data(); - const auto* saved_invstd1_data = saved_invstd1->data(); - auto* scale2_grad_data = scale2_grad->mutable_data(place); - auto* bias2_grad_data = bias2_grad->mutable_data(place); - - const float* conv1_input_max_data = nullptr; - const float* conv1_filter_max_data = nullptr; - const float* conv2_input_max_data = nullptr; - const float* conv2_filter_max_data = nullptr; - const float* conv3_input_max_data = nullptr; - const float* conv3_filter_max_data 
= nullptr; - if (attr.find_max) { - conv1_input_max_data = - reinterpret_cast(conv1_input_max->data()); - conv1_filter_max_data = - reinterpret_cast(conv1_filter_max->data()); - conv2_input_max_data = - reinterpret_cast(conv2_input_max->data()); - conv2_filter_max_data = - reinterpret_cast(conv2_filter_max->data()); - if (attr.has_shortcut) { - conv3_input_max_data = - reinterpret_cast(conv3_input_max->data()); - conv3_filter_max_data = - reinterpret_cast(conv3_filter_max->data()); - } - } - - auto& dev_ctx = ctx.template device_context(); - xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); - int r = XPU_SUCCESS; - - // 0. bn2, bn2_fusion grad - auto conv2_output_grad_data = - RAII_GUARD.alloc(attr.conv2_output_numel); - PADDLE_ENFORCE_XDNN_NOT_NULL(conv2_output_grad_data); - - XPUType* z_output_grad_data = nullptr; - XPUType* z_grad_data = nullptr; - if (!attr.has_shortcut) { - z_output_grad_data = RAII_GUARD.alloc(attr.conv1_input_numel); - PADDLE_ENFORCE_XDNN_NOT_NULL(z_output_grad_data); - z_grad_data = z_output_grad_data; - } else { - z_output_grad_data = RAII_GUARD.alloc(attr.conv3_output_numel); - PADDLE_ENFORCE_XDNN_NOT_NULL(z_output_grad_data); - - z_grad_data = RAII_GUARD.alloc(attr.conv1_input_numel); - PADDLE_ENFORCE_XDNN_NOT_NULL(z_grad_data); - } - - r = xpu::batch_norm_grad_fusion(dev_ctx.x_context(), - conv2_output_data, - y_data, - y_grad_data, - conv2_output_grad_data, - attr.conv2_output_shape[0], - attr.conv2_output_shape[1], - attr.conv2_output_shape[2], - attr.conv2_output_shape[3], - scale2_data, - saved_mean2_data, - saved_invstd2_data, - scale2_grad_data, - bias2_grad_data, - true, - z_output_grad_data, - xpu::Activation_t::RELU, - nullptr, - 0); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "batch_norm_grad_fusion"); - - if (attr.has_shortcut) { - // bn3 grad - const auto* conv3_output_data = - reinterpret_cast(conv3_out->data()); - const auto* scale3_data = scale3->data(); - const auto* saved_mean3_data = saved_mean3->data(); - const auto* saved_invstd3_data = saved_invstd3->data(); - auto* scale3_grad_data = scale3_grad->mutable_data(place); - auto* bias3_grad_data = bias3_grad->mutable_data(place); - auto* conv3_output_grad_data = - RAII_GUARD.alloc(attr.conv3_output_numel); - - r = xpu::batch_norm_grad(dev_ctx.x_context(), - conv3_output_data, - z_output_grad_data, - conv3_output_grad_data, - attr.conv3_output_shape[0], - attr.conv3_output_shape[1], - attr.conv3_output_shape[2], - attr.conv3_output_shape[3], - scale3_data, - saved_mean3_data, - saved_invstd3_data, - scale3_grad_data, - bias3_grad_data, - true); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "batch_norm_grad"); - - // conv3 grad - auto* conv3_filter_grad_data = - reinterpret_cast(filter3_grad->mutable_data(place)); - auto* conv3_filter_data = - reinterpret_cast(filter3->data()); - xpu_conv2d_grad(dev_ctx.x_context(), - x_data, - conv3_filter_data, - conv3_output_grad_data, - z_grad_data, - conv3_filter_grad_data, - conv3_input_max_data, - conv3_filter_max_data, - attr.conv3_input_shape, - attr.conv3_filter_shape, - attr.padding3, - attr.stride3, - attr.dilation3, - attr.group); - } - - // 2. 
conv2_grad - auto* conv2_filter_grad_data = - reinterpret_cast(filter2_grad->mutable_data(place)); - auto* conv2_input_grad_data = - RAII_GUARD.alloc(attr.conv2_input_numel); - xpu_conv2d_grad(dev_ctx.x_context(), - conv2_input_data, - conv2_filter_data, - conv2_output_grad_data, - conv2_input_grad_data, - conv2_filter_grad_data, - conv2_input_max_data, - conv2_filter_max_data, - attr.conv2_input_shape, - attr.conv2_filter_shape, - attr.padding2, - attr.stride2, - attr.dilation2, - attr.group); - - // 3. b1 grad - auto* conv1_output_grad_data = - RAII_GUARD.alloc(attr.conv1_output_numel); - PADDLE_ENFORCE_XDNN_NOT_NULL(conv1_output_grad_data); - auto* scale1_grad_data = scale1_grad->mutable_data(ctx.GetPlace()); - auto* bias1_grad_data = bias1_grad->mutable_data(ctx.GetPlace()); - r = xpu::batch_norm_grad_fusion(dev_ctx.x_context(), - conv1_output_data, - conv2_input_data, - conv2_input_grad_data, - conv1_output_grad_data, - attr.conv1_output_shape[0], - attr.conv1_output_shape[1], - attr.conv1_output_shape[2], - attr.conv1_output_shape[3], - scale1_data, - saved_mean1_data, - saved_invstd1_data, - scale1_grad_data, - bias1_grad_data, - true, - nullptr, - xpu::Activation_t::RELU, - nullptr, - 0); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "batch_norm_grad_fusion"); - - // 4. conv1_grad - auto* x_grad_data = - reinterpret_cast(x_grad->mutable_data(place)); - auto* conv1_filter_grad_data = - reinterpret_cast(filter1_grad->mutable_data(place)); - xpu_conv2d_grad(dev_ctx.x_context(), - x_data, - conv1_filter_data, - conv1_output_grad_data, - x_grad_data, - conv1_filter_grad_data, - conv1_input_max_data, - conv1_filter_max_data, - attr.conv1_input_shape, - attr.conv1_filter_shape, - attr.padding1, - attr.stride1, - attr.dilation1, - attr.group); - - // add z_grad to x_grad - r = xpu::add( - dev_ctx.x_context(), x_grad_data, z_grad_data, x_grad_data, x->numel()); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "add"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -PD_REGISTER_STRUCT_KERNEL(resnet_basic_block, - XPU, - ALL_LAYOUT, - ops::ResNetBasicBlockXPUKernel, - float) {} -PD_REGISTER_STRUCT_KERNEL(resnet_basic_block_grad, - XPU, - ALL_LAYOUT, - ops::ResNetBasicBlockGradXPUKernel, - float) {} -#endif diff --git a/paddle/fluid/operators/fused/resnet_unit_op.cc b/paddle/fluid/operators/fused/resnet_unit_op.cc deleted file mode 100644 index d4e9b3f8e4525..0000000000000 --- a/paddle/fluid/operators/fused/resnet_unit_op.cc +++ /dev/null @@ -1,465 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/common/float16.h" - -namespace paddle { -namespace operators { - -// Shape of bitmask -static framework::DDim GetBitmaskDims(std::vector out_shape) { - int c = out_shape.back(); - int64_t nhw = std::accumulate(out_shape.begin(), - out_shape.end(), - 1, - std::multiplies()) / // NOLINT - c; - int32_t c_int32_elems = ((c + 63) & ~63) / 32; - int32_t nhw_int32_elems = static_cast(((nhw + 31) & ~31)); - std::vector bitmask_shape = {nhw_int32_elems, c_int32_elems, 1}; - return common::make_ddim(bitmask_shape); -} - -class ResNetUnitOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - // Check input - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ResNetUnitOp"); - OP_INOUT_CHECK( - ctx->HasInput("FilterX"), "Input", "FilterX", "ResNetUnitOp"); - OP_INOUT_CHECK(ctx->HasInput("ScaleX"), "Input", "ScaleX", "ResNetUnitOp"); - OP_INOUT_CHECK(ctx->HasInput("BiasX"), "Input", "BiasX", "ResNetUnitOp"); - OP_INOUT_CHECK(ctx->HasInput("MeanX"), "Input", "MeanX", "ResNetUnitOp"); - OP_INOUT_CHECK(ctx->HasInput("VarX"), "Input", "VarX", "ResNetUnitOp"); - - bool fuse_add = ctx->Attrs().Get("fuse_add"); - bool has_shortcut = ctx->Attrs().Get("has_shortcut"); - if (fuse_add || has_shortcut) { - OP_INOUT_CHECK(ctx->HasInput("Z"), "Input", "Z", "ResNetUnitOp"); - } - if (has_shortcut) { - OP_INOUT_CHECK( - ctx->HasInput("FilterZ"), "Input", "FilterZ", "ResNetUnitOp"); - OP_INOUT_CHECK( - ctx->HasInput("ScaleZ"), "Input", "ScaleZ", "ResNetUnitOp"); - OP_INOUT_CHECK(ctx->HasInput("BiasZ"), "Input", "BiasZ", "ResNetUnitOp"); - OP_INOUT_CHECK(ctx->HasInput("MeanZ"), "Input", "MeanZ", "ResNetUnitOp"); - OP_INOUT_CHECK(ctx->HasInput("VarZ"), "Input", "VarZ", "ResNetUnitOp"); - } - - // Check output - OP_INOUT_CHECK(ctx->HasOutput("Y"), "Output", "Y", "ResNetUnitOp"); - OP_INOUT_CHECK( - ctx->HasOutput("BitMask"), "Output", "BitMask", "ResNetUnitOp"); - OP_INOUT_CHECK(ctx->HasOutput("ConvX"), "Output", "ConvX", "ResNetUnitOp"); - OP_INOUT_CHECK( - ctx->HasOutput("SavedMeanX"), "Output", "SavedMeanX", "ResNetUnitOp"); - OP_INOUT_CHECK(ctx->HasOutput("SavedInvstdX"), - "Output", - "SavedInvstdX", - "ResNetUnitOp"); - OP_INOUT_CHECK(ctx->HasOutput("RunningMeanX"), - "Output", - "RunningMeanX", - "ResNetUnitOp"); - OP_INOUT_CHECK( - ctx->HasOutput("RunningVarX"), "Output", "RunningVarX", "ResNetUnitOp"); - if (has_shortcut) { - OP_INOUT_CHECK( - ctx->HasOutput("ConvZ"), "Output", "ConvZ", "ResNetUnitOp"); - OP_INOUT_CHECK( - ctx->HasOutput("SavedMeanZ"), "Output", "SavedMeanZ", "ResNetUnitOp"); - OP_INOUT_CHECK(ctx->HasOutput("SavedInvstdZ"), - "Output", - "SavedInvstdZ", - "ResNetUnitOp"); - OP_INOUT_CHECK(ctx->HasOutput("RunningMeanZ"), - "Output", - "RunningMeanZ", - "ResNetUnitOp"); - OP_INOUT_CHECK(ctx->HasOutput("RunningVarZ"), - "Output", - "RunningVarZ", - "ResNetUnitOp"); - } - - // make sure Mean/RunningMean and Var/RunningVar share memory - PADDLE_ENFORCE_EQ( - ctx->Inputs("MeanX")[0], - ctx->Outputs("RunningMeanX")[0], - phi::errors::InvalidArgument( - "MeanX and RunningMeanX should share the same memory")); - PADDLE_ENFORCE_EQ(ctx->Inputs("VarX")[0], - ctx->Outputs("RunningVarX")[0], - phi::errors::InvalidArgument( - "VarX and RunningVarX should share the same memory")); - if (has_shortcut) { - PADDLE_ENFORCE_EQ( - ctx->Inputs("MeanZ")[0], - ctx->Outputs("RunningMeanZ")[0], - 
phi::errors::InvalidArgument( - "MeanZ and RunningMeanZ should share the same memory")); - PADDLE_ENFORCE_EQ( - ctx->Inputs("VarZ")[0], - ctx->Outputs("RunningVarZ")[0], - phi::errors::InvalidArgument( - "VarZ and RunningVarZ should share the same memory")); - } - - // Check dims of inputs - const auto x_dims = ctx->GetInputDim("X"); - const auto w_dims = ctx->GetInputDim("FilterX"); - std::vector bn_param_shape = - common::vectorize(ctx->GetInputDim("ScaleX")); - if (1 == bn_param_shape.size()) { - bn_param_shape = {1, 1, 1, bn_param_shape[0]}; - } - framework::DDim bn_param_dims = common::make_ddim(bn_param_shape); - PADDLE_ENFORCE_EQ( - x_dims.size(), - 4, - phi::errors::InvalidArgument("The dimensions of input " - "must equal to 4." - "But received: the shape of input " - "= [%s], the dimension of input = " - "[%d]", - x_dims, - x_dims.size())); - PADDLE_ENFORCE_EQ( - w_dims.size(), - 4, - phi::errors::InvalidArgument("The dimensions of filter " - "must equal to 4." - "But received: the shape of filter " - "= [%s], the dimension of filter = [%d] ", - w_dims, - w_dims.size())); - PADDLE_ENFORCE_EQ(bn_param_dims.size(), - 4, - phi::errors::InvalidArgument( - "The dimensions of bn param " - "must equal to 4." - "But received: the shape of bn param " - "= [%s], the dimension of bn param = [%d] ", - bn_param_dims, - bn_param_dims.size())); - auto data_format = ctx->Attrs().Get("data_format"); - bool is_nchw = (data_format == "NCHW"); - // Calculate the dims of outputs - int batch = x_dims[0]; - int output_channel = w_dims[0]; - int filter_size = w_dims[2]; - int stride = ctx->Attrs().Get("stride"); - int padding = ctx->Attrs().Get("padding"); - std::vector out_shape; - out_shape.push_back(batch); - if (is_nchw) { - int out_h = (x_dims[2] + padding * 2 - filter_size) / stride + 1; - int out_w = (x_dims[3] + padding * 2 - filter_size) / stride + 1; - out_shape.push_back(output_channel); - out_shape.push_back(out_h); - out_shape.push_back(out_w); - } else { - int out_h = (x_dims[1] + padding * 2 - filter_size) / stride + 1; - int out_w = (x_dims[2] + padding * 2 - filter_size) / stride + 1; - out_shape.push_back(out_h); - out_shape.push_back(out_w); - out_shape.push_back(output_channel); - } - - auto y_dims = common::make_ddim(out_shape); - auto bitmask_dims = GetBitmaskDims(out_shape); - // Set dims of outputs - ctx->SetOutputDim("Y", y_dims); - ctx->SetOutputDim("BitMask", bitmask_dims); - ctx->SetOutputDim("ConvX", y_dims); - ctx->SetOutputDim("SavedMeanX", bn_param_dims); - ctx->SetOutputDim("SavedInvstdX", bn_param_dims); - ctx->SetOutputDim("RunningMeanX", bn_param_dims); - ctx->SetOutputDim("RunningVarX", bn_param_dims); - if (has_shortcut) { - ctx->SetOutputDim("ConvZ", y_dims); - ctx->SetOutputDim("SavedMeanZ", bn_param_dims); - ctx->SetOutputDim("SavedInvstdZ", bn_param_dims); - ctx->SetOutputDim("RunningMeanZ", bn_param_dims); - ctx->SetOutputDim("RunningVarZ", bn_param_dims); - } - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); - // By default, the type of the scale, bias, mean, - // and var tensors should be float when input tensor's dtype is float16. 
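-    // (Annotation) Keeping BN scale/bias/mean/var in FP32 while the conv
-    // itself may run in FP16 is standard mixed-precision practice; the
-    // checks below enforce exactly that for ScaleX and BiasX.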
- auto bn_param_type = framework::proto::VarType::FP32; - - PADDLE_ENFORCE_EQ( - bn_param_type, - framework::TransToProtoVarType( - ctx.Input("ScaleX")->dtype()), - phi::errors::InvalidArgument("Scale input should be of float type")); - PADDLE_ENFORCE_EQ( - bn_param_type, - framework::TransToProtoVarType( - ctx.Input("BiasX")->dtype()), - phi::errors::InvalidArgument("Bias input should be of float type")); - return phi::KernelKey(input_data_type, ctx.GetPlace()); - } -}; - -class ResNetUnitOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "The input 1 tensor"); - AddInput("FilterX", "Filter tensor of input 1"); - AddInput("ScaleX", "Scale tensor of input 1 used in batchnorm"); - AddInput("BiasX", "Bias tensor of input 1 used in batchnorm"); - AddInput("MeanX", "Mean tensor of input 1 used in batchnorm"); - AddInput("VarX", "Variance tensor of input 1 used in batchnorm"); - AddInput("Z", "The input 2 tensor").AsDispensable(); - AddInput("FilterZ", "Filter tensor of input 2").AsDispensable(); - AddInput("ScaleZ", "Scale tensor of input 2").AsDispensable(); - AddInput("BiasZ", "Bias tensor of input 2").AsDispensable(); - AddInput("MeanZ", "Mean tensor of input 2").AsDispensable(); - AddInput("VarZ", "Variance tensor of input 2").AsDispensable(); - AddOutput("Y", "The result of the resnet unit"); - AddOutput("BitMask", "The bitmask generated after relu"); - AddOutput("ConvX", "The output of input 1 after conv"); - AddOutput("SavedMeanX", "Mean of input 1 in the current batch"); - AddOutput("SavedInvstdX", "Invstd of input 1 in the current batch"); - AddOutput("RunningMeanX", "Shared memory with MeanX"); - AddOutput("RunningVarX", "Shared memory with VarX"); - AddOutput("ConvZ", "The output of input 2 after conv").AsDispensable(); - AddOutput("SavedMeanZ", "Mean of input 1 in the current batch") - .AsDispensable(); - AddOutput("SavedInvstdZ", "Invstd of input 1 in the current batch") - .AsDispensable(); - AddOutput("RunningMeanZ", "Shared memory with MeanZ").AsDispensable(); - AddOutput("RunningVarZ", "Shared memory with VarZ").AsDispensable(); - AddAttr("stride", "").SetDefault(1); - AddAttr("stride_z", "").SetDefault(1); - AddAttr("padding", "").SetDefault(0); - AddAttr("dilation", "").SetDefault(1); - AddAttr("group", "").SetDefault(1); - AddAttr("momentum", "").SetDefault(0.9); - AddAttr("epsilon", "").SetDefault(1e-5); - AddAttr("data_format", "").SetDefault("NHWC"); - AddAttr("fuse_add", "").SetDefault(false); - AddAttr("has_shortcut", "").SetDefault(false); - AddAttr("use_global_stats", "").SetDefault(false); - AddAttr("is_test", - "(bool, default false) Set to true for inference only, false " - "for training. Some layers may run faster when this is true.") - .SetDefault(false); - AddAttr("use_addto", "").SetDefault(false); - AddAttr("act_type", "The activation type to be fused.") - .SetDefault("relu"); - AddComment(R"DOC( -Fusion op of the basic unit of resnet block. - -The implementation is based on the latest fusion op interface in cuDNN v8.0. 
-For more details: -https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnFusedOps_t - -)DOC"); - } -}; - -class ResNetUnitGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - // check input - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ResNetUnitGradOp"); - OP_INOUT_CHECK( - ctx->HasInput("FilterX"), "Input", "FilterX", "ResNetUnitGradOp"); - OP_INOUT_CHECK( - ctx->HasInput("ConvX"), "Input", "ConvX", "ResNetUnitGradOp"); - OP_INOUT_CHECK( - ctx->HasInput("ScaleX"), "Input", "ScaleX", "ResNetUnitGradOp"); - OP_INOUT_CHECK( - ctx->HasInput("BiasX"), "Input", "BiasX", "ResNetUnitGradOp"); - OP_INOUT_CHECK( - ctx->HasInput("SavedMeanX"), "Input", "SavedMeanX", "ResNetUnitGradOp"); - OP_INOUT_CHECK(ctx->HasInput("SavedInvstdX"), - "Input", - "SavedInvstdX", - "ResNetUnitGradOp"); - - bool fuse_add = ctx->Attrs().Get("fuse_add"); - bool has_shortcut = ctx->Attrs().Get("has_shortcut"); - if (fuse_add || has_shortcut) { - OP_INOUT_CHECK(ctx->HasInput("Z"), "Input", "Z", "ResNetUnitGradOp"); - } - if (has_shortcut) { - OP_INOUT_CHECK( - ctx->HasInput("FilterZ"), "Input", "FilterZ", "ResNetUnitGradOp"); - OP_INOUT_CHECK( - ctx->HasInput("ConvZ"), "Input", "ConvZ", "ResNetUnitGradOp"); - OP_INOUT_CHECK( - ctx->HasInput("ScaleZ"), "Input", "ScaleZ", "ResNetUnitGradOp"); - OP_INOUT_CHECK( - ctx->HasInput("BiasZ"), "Input", "BiasZ", "ResNetUnitGradOp"); - OP_INOUT_CHECK(ctx->HasInput("SavedMeanZ"), - "Input", - "SavedMeanZ", - "ResNetUnitGradOp"); - OP_INOUT_CHECK(ctx->HasInput("SavedInvstdZ"), - "Input", - "SavedInvstdZ", - "ResNetUnitGradOp"); - } - OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "ResNetUnitGradOp"); - OP_INOUT_CHECK( - ctx->HasInput("BitMask"), "Input", "BitMask", "ResNetUnitGradOp"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Y")), - "Input", - framework::GradVarName("Y"), - "ResNetUnitGradOp"); - - // check output - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), - "Output", - framework::GradVarName("X"), - "ResNetUnitGradOp"); - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("FilterX")), - "Output", - framework::GradVarName("FilterX"), - "ResNetUnitGradOp"); - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("ScaleX")), - "Output", - framework::GradVarName("ScaleX"), - "ResNetUnitGradOp"); - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("BiasX")), - "Output", - framework::GradVarName("BiasX"), - "ResNetUnitGradOp"); - if (fuse_add) { - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Z")), - "Output", - framework::GradVarName("Z"), - "ResNetUnitGradOp"); - } - if (has_shortcut) { - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("FilterZ")), - "Output", - framework::GradVarName("FilterZ"), - "ResNetUnitGradOp"); - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("ScaleZ")), - "Output", - framework::GradVarName("ScaleZ"), - "ResNetUnitGradOp"); - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("BiasZ")), - "Output", - framework::GradVarName("BiasZ"), - "ResNetUnitGradOp"); - } - const auto x_dims = ctx->GetInputDim("X"); - const auto filter_x_dims = ctx->GetInputDim("FilterX"); - const auto param_dims = ctx->GetInputDim("ScaleX"); - ctx->SetOutputDim(framework::GradVarName("X"), x_dims); - ctx->SetOutputDim(framework::GradVarName("FilterX"), filter_x_dims); - ctx->SetOutputDim(framework::GradVarName("ScaleX"), param_dims); - 
ctx->SetOutputDim(framework::GradVarName("BiasX"), param_dims); - if (fuse_add || has_shortcut) { - const auto z_dims = ctx->GetInputDim("Z"); - ctx->SetOutputDim(framework::GradVarName("Z"), z_dims); - } - if (has_shortcut) { - const auto filter_z_dims = ctx->GetInputDim("FilterZ"); - ctx->SetOutputDim(framework::GradVarName("FilterZ"), filter_z_dims); - ctx->SetOutputDim(framework::GradVarName("ScaleZ"), param_dims); - ctx->SetOutputDim(framework::GradVarName("BiasZ"), param_dims); - } - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_NOT_NULL( - ctx.InputVar(framework::GradVarName("Y")), - phi::errors::NotFound("Can not find Y@GRAD in the execution context.")); - - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.GetPlace()); - } -}; - -template -class ResNetUnitGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("resnet_unit_grad"); - op->SetInput("X", this->Input("X")); - op->SetInput("FilterX", this->Input("FilterX")); - op->SetInput("ConvX", this->Output("ConvX")); - op->SetInput("ScaleX", this->Input("ScaleX")); - op->SetInput("BiasX", this->Input("BiasX")); - op->SetInput("SavedMeanX", this->Output("SavedMeanX")); - op->SetInput("SavedInvstdX", this->Output("SavedInvstdX")); - op->SetInput("Z", this->Input("Z")); - op->SetInput("FilterZ", this->Input("FilterZ")); - op->SetInput("ConvZ", this->Output("ConvZ")); - op->SetInput("ScaleZ", this->Input("ScaleZ")); - op->SetInput("BiasZ", this->Input("BiasZ")); - op->SetInput("SavedMeanZ", this->Output("SavedMeanZ")); - op->SetInput("SavedInvstdZ", this->Output("SavedInvstdZ")); - op->SetInput("Y", this->Output("Y")); - op->SetInput("BitMask", this->Output("BitMask")); - op->SetInput(framework::GradVarName("Y"), this->OutputGrad("Y")); - - op->SetAttrMap(this->Attrs()); - - op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - op->SetOutput(framework::GradVarName("FilterX"), - this->InputGrad("FilterX")); - op->SetOutput(framework::GradVarName("ScaleX"), this->InputGrad("ScaleX")); - op->SetOutput(framework::GradVarName("BiasX"), this->InputGrad("BiasX")); - op->SetOutput(framework::GradVarName("Z"), this->InputGrad("Z")); - op->SetOutput(framework::GradVarName("FilterZ"), - this->InputGrad("FilterZ")); - op->SetOutput(framework::GradVarName("ScaleZ"), this->InputGrad("ScaleZ")); - op->SetOutput(framework::GradVarName("BiasZ"), this->InputGrad("BiasZ")); - } -}; - -class ResNetUnitOpInferVarType - : public framework::PassInDtypeAndVarTypeToOutput { - protected: - std::unordered_map& GetInputOutputWithSameType() - const override { - static std::unordered_map m{{"X", /*->*/ "Y"}}; - return m; - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR(resnet_unit, - ops::ResNetUnitOp, - ops::ResNetUnitOpMaker, - ops::ResNetUnitOpInferVarType, - ops::ResNetUnitGradOpMaker, - ops::ResNetUnitGradOpMaker); -REGISTER_OPERATOR(resnet_unit_grad, ops::ResNetUnitGradOp); diff --git a/paddle/fluid/operators/fused/resnet_unit_op.cu b/paddle/fluid/operators/fused/resnet_unit_op.cu deleted file mode 100644 index 6afe03a67ceab..0000000000000 --- a/paddle/fluid/operators/fused/resnet_unit_op.cu +++ /dev/null @@ -1,429 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h" -#include "paddle/fluid/operators/fused/cudnn_norm_conv.cu.h" -#include "paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h" -#include "paddle/phi/common/float16.h" - -namespace paddle { -namespace operators { - -template -class ResNetUnitKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), - true, - phi::errors::PreconditionNotMet("It must use CUDAPlace.")); - PADDLE_ENFORCE_EQ(platform::CudnnDataType::type, - CUDNN_DATA_HALF, - phi::errors::Unavailable( - "ResNetUnitOp only supports float16 for now.")); - - // input x - const phi::DenseTensor *input_x = ctx.Input("X"); - const phi::DenseTensor *filter_x = ctx.Input("FilterX"); - const phi::DenseTensor *scale_x = ctx.Input("ScaleX"); - const phi::DenseTensor *bias_x = ctx.Input("BiasX"); - // norm conv - phi::DenseTensor *conv_out_x = ctx.Output("ConvX"); - // bn finalize - phi::DenseTensor *saved_mean_x = ctx.Output("SavedMeanX"); - phi::DenseTensor *saved_invstd_x = - ctx.Output("SavedInvstdX"); - phi::DenseTensor *running_mean_x = - ctx.Output("RunningMeanX"); - phi::DenseTensor *running_var_x = - ctx.Output("RunningVarX"); - // sbar - phi::DenseTensor *output = ctx.Output("Y"); - phi::DenseTensor *bitmask = ctx.Output("BitMask"); - // attrs - int padding = ctx.Attr("padding"); - int stride = ctx.Attr("stride"); - int stride_z = ctx.Attr("stride_z"); - int dilation = ctx.Attr("dilation"); - int group = ctx.Attr("group"); - double eps = static_cast(ctx.Attr("epsilon")); - double momentum = static_cast(ctx.Attr("momentum")); - bool has_shortcut = ctx.Attr("has_shortcut"); - bool fuse_add = ctx.Attr("fuse_add"); - bool use_global_stats = ctx.Attr("use_global_stats"); - bool is_test = ctx.Attr("is_test"); - bool is_train = !is_test && !use_global_stats; - std::string act_type = ctx.Attr("act_type"); - - auto input_x_shape = common::vectorize(input_x->dims()); - auto filter_x_shape = common::vectorize(filter_x->dims()); - // std::swap used to convert shape of filter from conv2d when kernel size is - // 1. - if (filter_x_shape[1] != filter_x_shape[2] && 1 == filter_x_shape[2]) { - std::swap(filter_x_shape[1], filter_x_shape[3]); - } - auto param_dims = scale_x->dims(); - auto param_shape = common::vectorize(scale_x->dims()); - if (1 == param_shape.size()) { - param_shape = {1, 1, 1, param_shape[0]}; - } - auto output_shape = common::vectorize(output->dims()); - auto bitmask_shape = common::vectorize(bitmask->dims()); - int output_channel = filter_x_shape[0]; - int64_t ele_count = std::accumulate(output_shape.begin(), - output_shape.end(), - 1, - std::multiplies()) / - output_channel; - - auto place = ctx.GetPlace(); - auto &dev_ctx = ctx.template device_context(); - - // 1. 
Conv - phi::DenseTensor sum_x; - phi::DenseTensor sum_of_squares_x; - sum_x.Resize(param_dims); - sum_of_squares_x.Resize(param_dims); - CudnnNormConvolution conv_x_op(dev_ctx, - input_x_shape, - filter_x_shape, - output_shape, - padding, - stride, - dilation, - group); - conv_x_op.Forward( - dev_ctx, *input_x, *filter_x, conv_out_x, &sum_x, &sum_of_squares_x); - - // 2. BN - phi::DenseTensor equiv_scale_x; - phi::DenseTensor equiv_bias_x; - equiv_scale_x.Resize(param_dims); - equiv_bias_x.Resize(param_dims); - CudnnBNStatsFinalize bn_x_op(dev_ctx, param_shape); - bn_x_op.Forward(dev_ctx, - sum_x, - sum_of_squares_x, - *scale_x, - *bias_x, - saved_mean_x, - saved_invstd_x, - running_mean_x, - running_var_x, - &equiv_scale_x, - &equiv_bias_x, - eps, - momentum, - ele_count, - is_train); - - // 3. scale + bias + add + relu - CudnnScaleBiasAddRelu sbar_op(dev_ctx, - act_type, - fuse_add, - has_shortcut, - output_shape, - param_shape, - bitmask_shape); - if (has_shortcut) { - // input z - const phi::DenseTensor *input_z = ctx.Input("Z"); - const phi::DenseTensor *filter_z = ctx.Input("FilterZ"); - const phi::DenseTensor *scale_z = ctx.Input("ScaleZ"); - const phi::DenseTensor *bias_z = ctx.Input("BiasZ"); - // norm conv - phi::DenseTensor *conv_out_z = ctx.Output("ConvZ"); - // bn finalize - phi::DenseTensor *saved_mean_z = - ctx.Output("SavedMeanZ"); - phi::DenseTensor *saved_invstd_z = - ctx.Output("SavedInvstdZ"); - phi::DenseTensor *running_mean_z = - ctx.Output("RunningMeanZ"); - phi::DenseTensor *running_var_z = - ctx.Output("RunningVarZ"); - - auto input_z_shape = common::vectorize(input_z->dims()); - auto filter_z_shape = common::vectorize(filter_z->dims()); - - // 3.1 Conv for second input - phi::DenseTensor sum_z; - phi::DenseTensor sum_of_squares_z; - sum_z.Resize(param_dims); - sum_of_squares_z.Resize(param_dims); - CudnnNormConvolution conv_z_op(dev_ctx, - input_z_shape, - filter_z_shape, - output_shape, - padding, - stride_z, - dilation, - group); - conv_z_op.Forward( - dev_ctx, *input_z, *filter_z, conv_out_z, &sum_z, &sum_of_squares_z); - - // 3.2 BN for second input - phi::DenseTensor equiv_scale_z; - phi::DenseTensor equiv_bias_z; - equiv_scale_z.Resize(param_dims); - equiv_bias_z.Resize(param_dims); - CudnnBNStatsFinalize bn_z_op(dev_ctx, param_shape); - bn_z_op.Forward(dev_ctx, - sum_z, - sum_of_squares_z, - *scale_z, - *bias_z, - saved_mean_z, - saved_invstd_z, - running_mean_z, - running_var_z, - &equiv_scale_z, - &equiv_bias_z, - eps, - momentum, - ele_count, - is_train); - // 3.3 sbar - sbar_op.Forward(dev_ctx, - *conv_out_x, - equiv_scale_x, - equiv_bias_x, - conv_out_z, - &equiv_scale_z, - &equiv_bias_z, - output, - bitmask); - } else { - const phi::DenseTensor *input_z = - fuse_add ? 
ctx.Input("Z") : nullptr; - sbar_op.Forward(dev_ctx, - *conv_out_x, - equiv_scale_x, - equiv_bias_x, - input_z, - nullptr, - nullptr, - output, - bitmask); - } - } -}; - -template -class ResNetUnitGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), - true, - phi::errors::PreconditionNotMet("It must use CUDAPlace.")); - PADDLE_ENFORCE_EQ(platform::CudnnDataType::type, - CUDNN_DATA_HALF, - phi::errors::Unavailable( - "ResNetUnitOp only supports float16 for now.")); - - const phi::DenseTensor *y_grad = - ctx.Input(framework::GradVarName("Y")); - - const phi::DenseTensor *x = ctx.Input("X"); - const phi::DenseTensor *filter_x = ctx.Input("FilterX"); - const phi::DenseTensor *scale_x = ctx.Input("ScaleX"); - const phi::DenseTensor *bias_x = ctx.Input("BiasX"); - const phi::DenseTensor *saved_mean_x = - ctx.Input("SavedMeanX"); - const phi::DenseTensor *saved_invstd_x = - ctx.Input("SavedInvstdX"); - - const phi::DenseTensor *conv_out_x = ctx.Input("ConvX"); - const phi::DenseTensor *output = ctx.Input("Y"); - const phi::DenseTensor *bitmask = ctx.Input("BitMask"); - - phi::DenseTensor *x_grad = - ctx.Output(framework::GradVarName("X")); - phi::DenseTensor *filter_x_grad = - ctx.Output(framework::GradVarName("FilterX")); - phi::DenseTensor *scale_x_grad = - ctx.Output(framework::GradVarName("ScaleX")); - phi::DenseTensor *bias_x_grad = - ctx.Output(framework::GradVarName("BiasX")); - - int padding = ctx.Attr("padding"); - int stride = ctx.Attr("stride"); - int stride_z = ctx.Attr("stride_z"); - int dilation = ctx.Attr("dilation"); - int group = ctx.Attr("group"); - double eps = static_cast(ctx.Attr("epsilon")); - double momentum = static_cast(ctx.Attr("momentum")); - bool has_shortcut = ctx.Attr("has_shortcut"); - bool fuse_add = ctx.Attr("fuse_add"); - bool use_global_stats = ctx.Attr("use_global_stats"); - std::string act_type = ctx.Attr("act_type"); - - auto x_shape = common::vectorize(x->dims()); - auto filter_x_shape = common::vectorize(filter_x->dims()); - auto param_shape = common::vectorize(scale_x->dims()); - auto output_shape = common::vectorize(output->dims()); - auto bitmask_shape = common::vectorize(bitmask->dims()); - - auto place = ctx.GetPlace(); - auto &dev_ctx = ctx.template device_context(); - - // 1. 
Backward of BN (+ Add + Relu) for x, get conv_out_x_grad, - // scale_x_grad, bias_x_grad - phi::DenseTensor conv_out_x_grad; - conv_out_x_grad.Resize(conv_out_x->dims()); - CudnnScaleBiasAddRelu sbar_x_op(dev_ctx, - act_type, - fuse_add, - has_shortcut, - output_shape, - param_shape, - bitmask_shape); - if (has_shortcut) { - // X Z - // | | - // NormConv NormConv - // | | - // BNStatsFinalize BNStatsFinalize - // \ / - // ScaleBiasAddRelu - // | - // Y - const phi::DenseTensor *z = ctx.Input("Z"); - const phi::DenseTensor *filter_z = ctx.Input("FilterZ"); - const phi::DenseTensor *scale_z = ctx.Input("ScaleZ"); - const phi::DenseTensor *bias_z = ctx.Input("BiasZ"); - const phi::DenseTensor *saved_mean_z = - ctx.Input("SavedMeanZ"); - const phi::DenseTensor *saved_invstd_z = - ctx.Input("SavedInvstdZ"); - const phi::DenseTensor *conv_out_z = ctx.Input("ConvZ"); - - phi::DenseTensor *z_grad = - ctx.Output(framework::GradVarName("Z")); - phi::DenseTensor *filter_z_grad = - ctx.Output(framework::GradVarName("FilterZ")); - phi::DenseTensor *scale_z_grad = - ctx.Output(framework::GradVarName("ScaleZ")); - phi::DenseTensor *bias_z_grad = - ctx.Output(framework::GradVarName("BiasZ")); - - // 1.1 Backward of BN + Add (+ Relu) for x, get conv_out_x_grad, - // scale_x_grad, bias_x_grad and z_grad_temp - phi::DenseTensor z_grad_temp; - z_grad_temp.Resize(conv_out_z->dims()); - sbar_x_op.Backward(dev_ctx, - *y_grad, - *conv_out_x, - *scale_x, - *bias_x, - *saved_mean_x, - *saved_invstd_x, - bitmask, - &conv_out_x_grad, - &z_grad_temp, - scale_x_grad, - bias_x_grad, - eps); - - // 1.2 bn backward for z, get conv_out_z_grad, dscale_z, dbias_z - phi::DenseTensor conv_out_z_grad; - conv_out_z_grad.Resize(conv_out_z->dims()); - CudnnScaleBiasAddRelu sbar_z_op( - dev_ctx, "", false, false, output_shape, param_shape, bitmask_shape); - sbar_z_op.Backward(dev_ctx, - z_grad_temp, - *conv_out_z, - *scale_z, - *bias_z, - *saved_mean_z, - *saved_invstd_z, - nullptr, - &conv_out_z_grad, - nullptr, - scale_z_grad, - bias_z_grad, - eps); - - // 1.3 Backward of Conv for z, get z_grad and filter_z_grad - auto z_shape = common::vectorize(z->dims()); - auto filter_z_shape = common::vectorize(filter_z->dims()); - CudnnNormConvolutionGrad conv_z_op(dev_ctx, - z_shape, - filter_z_shape, - output_shape, - padding, - stride_z, - dilation, - group); - conv_z_op.Backward( - dev_ctx, *z, *filter_z, conv_out_z_grad, z_grad, filter_z_grad); - } else { - // 1.1 Backward of BN (+ Add + Relu) for x, get conv_out_x_grad, - // scale_x_grad, bias_x_grad (and z_grad) - phi::DenseTensor *z_grad = - fuse_add ? ctx.Output(framework::GradVarName("Z")) - : nullptr; - sbar_x_op.Backward(dev_ctx, - *y_grad, - *conv_out_x, - *scale_x, - *bias_x, - *saved_mean_x, - *saved_invstd_x, - bitmask, - &conv_out_x_grad, - z_grad, - scale_x_grad, - bias_x_grad, - eps); - } - - // 2. 
Backward of Conv for x, get x_grad and filter_x_grad - bool use_addto = ctx.Attr("use_addto"); - CudnnNormConvolutionGrad conv_x_op(dev_ctx, - x_shape, - filter_x_shape, - output_shape, - padding, - stride, - dilation, - group); - conv_x_op.Backward(dev_ctx, - *x, - *filter_x, - conv_out_x_grad, - x_grad, - filter_x_grad, - use_addto); - } -}; - -} // namespace operators -} // namespace paddle - -#if CUDNN_VERSION >= 8000 -namespace ops = paddle::operators; -namespace plat = paddle::platform; -PD_REGISTER_STRUCT_KERNEL( - resnet_unit, GPU, ALL_LAYOUT, ops::ResNetUnitKernel, phi::dtype::float16) {} -PD_REGISTER_STRUCT_KERNEL(resnet_unit_grad, - GPU, - ALL_LAYOUT, - ops::ResNetUnitGradKernel, - phi::dtype::float16) {} -#endif diff --git a/paddle/fluid/operators/fused/resnet_unit_op_xpu.cc b/paddle/fluid/operators/fused/resnet_unit_op_xpu.cc deleted file mode 100644 index f50d452d6c285..0000000000000 --- a/paddle/fluid/operators/fused/resnet_unit_op_xpu.cc +++ /dev/null @@ -1,373 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device/device_wrapper.h" -#include "paddle/phi/common/float16.h" - -namespace paddle { -namespace operators { - -template -class ResNetUnitXPUKernel : public framework::OpKernel { - using XPUType = typename XPUTypeTrait::Type; - - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto place = ctx.GetPlace(); - PADDLE_ENFORCE_EQ(platform::is_xpu_place(place), - true, - phi::errors::PreconditionNotMet("It must use XPUPlace.")); - - bool is_nchw = (ctx.Attr("data_format") == "NCHW"); - // input x - const phi::DenseTensor *input_x = ctx.Input("X"); - const phi::DenseTensor *filter_x = ctx.Input("FilterX"); - const phi::DenseTensor *scale_x = ctx.Input("ScaleX"); - const phi::DenseTensor *bias_x = ctx.Input("BiasX"); - - // output x - phi::DenseTensor *conv_out_x = ctx.Output("ConvX"); - phi::DenseTensor *saved_mean_x = ctx.Output("SavedMeanX"); - phi::DenseTensor *saved_invstd_x = - ctx.Output("SavedInvstdX"); - phi::DenseTensor *running_mean_x = - ctx.Output("RunningMeanX"); - phi::DenseTensor *running_var_x = - ctx.Output("RunningVarX"); - - phi::DenseTensor *output = ctx.Output("Y"); - - // attrs - int padding = ctx.Attr("padding"); - int stride = ctx.Attr("stride"); - int stride_z = ctx.Attr("stride_z"); - int dilation = ctx.Attr("dilation"); - int group = ctx.Attr("group"); - float eps = ctx.Attr("epsilon"); - float momentum = ctx.Attr("momentum"); - bool has_shortcut = ctx.Attr("has_shortcut"); - bool fuse_add = ctx.Attr("fuse_add"); - bool use_global_stats = ctx.Attr("use_global_stats"); - bool is_test = ctx.Attr("is_test"); - bool is_train = !is_test && !use_global_stats; - std::string act_type = ctx.Attr("act_type"); - auto &dev_ctx = ctx.template device_context(); - - std::vector x_list = { - reinterpret_cast(input_x->data())}; - std::vector w_list = { - reinterpret_cast(filter_x->data())}; - 
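// Note on the packing convention used below: the XPU fused API consumes
// parallel per-branch vectors (x_list, w_list, conv_y_list, ksize_list,
// stride_list, scale_list, ...). Index 0 always holds the main X branch;
// when has_shortcut (or, for the input list, fuse_add) is set, the Z branch
// is appended as index 1, so every per-branch list must be pushed in the
// same order.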
std::vector conv_y_list = { - reinterpret_cast(conv_out_x->mutable_data(place))}; - - std::vector> x_shape_list = { - common::vectorize(input_x->dims())}; - - auto filter_x_shape = common::vectorize(filter_x->dims()); - std::vector ksize = {filter_x_shape[2], filter_x_shape[3]}; - if (!is_nchw) { - ksize[0] = filter_x_shape[1]; - ksize[1] = filter_x_shape[2]; - } - std::vector strides = {stride, stride}; - std::vector> ksize_list = {ksize}; - std::vector> stride_list = {strides}; - std::vector paddings = {padding, padding}; - std::vector dilations = {dilation, dilation}; - std::vector scale_list = {scale_x->data()}; - std::vector bias_list = {bias_x->data()}; - std::vector batch_mean_list = { - saved_mean_x->mutable_data(place)}; - std::vector batch_invstd_list = { - saved_invstd_x->mutable_data(place)}; - std::vector global_mean_list = { - running_mean_x->mutable_data(place)}; - std::vector global_var_list = { - running_var_x->mutable_data(place)}; - - std::vector x_maxlist = {nullptr}; - std::vector w_maxlist = {nullptr}; - if (has_shortcut) { - // input z - const phi::DenseTensor *input_z = ctx.Input("Z"); - const phi::DenseTensor *filter_z = ctx.Input("FilterZ"); - const phi::DenseTensor *scale_z = ctx.Input("ScaleZ"); - const phi::DenseTensor *bias_z = ctx.Input("BiasZ"); - - phi::DenseTensor *conv_out_z = ctx.Output("ConvZ"); - phi::DenseTensor *saved_mean_z = - ctx.Output("SavedMeanZ"); - phi::DenseTensor *saved_invstd_z = - ctx.Output("SavedInvstdZ"); - phi::DenseTensor *running_mean_z = - ctx.Output("RunningMeanZ"); - phi::DenseTensor *running_var_z = - ctx.Output("RunningVarZ"); - - x_list.push_back(reinterpret_cast(input_z->data())); - w_list.push_back(reinterpret_cast(filter_z->data())); - conv_y_list.push_back( - reinterpret_cast(conv_out_z->mutable_data(place))); - - x_shape_list.push_back(common::vectorize(input_z->dims())); - - auto filter_z_shape = common::vectorize(filter_z->dims()); - std::vector ksize_z = {filter_z_shape[2], filter_z_shape[3]}; - if (!is_nchw) { - ksize_z[0] = filter_z_shape[1]; - ksize_z[1] = filter_z_shape[2]; - } - ksize_list.push_back(ksize_z); - stride_list.push_back({stride_z, stride_z}); - scale_list.push_back(scale_z->data()); - bias_list.push_back(bias_z->data()); - batch_mean_list.push_back(saved_mean_z->mutable_data(place)); - batch_invstd_list.push_back(saved_invstd_z->mutable_data(place)); - global_mean_list.push_back(running_mean_z->mutable_data(place)); - global_var_list.push_back(running_var_z->mutable_data(place)); - x_maxlist.push_back(nullptr); - w_maxlist.push_back(nullptr); - } else { - if (fuse_add) { - const phi::DenseTensor *input_z = ctx.Input("Z"); - auto input_z_shape = common::vectorize(input_z->dims()); - x_list.push_back(reinterpret_cast(input_z->data())); - x_shape_list.push_back(input_z_shape); - x_maxlist.push_back(nullptr); - } - } - int r = xpu::resnet_unit_fusion( - dev_ctx.x_context(), - x_list, - w_list, - conv_y_list, - reinterpret_cast(output->mutable_data(place)), - x_shape_list, - filter_x_shape[0], - ksize_list, - stride_list, - paddings, - dilations, - group, - eps, - momentum, - x_maxlist, - w_maxlist, - scale_list, - bias_list, - batch_mean_list, - batch_invstd_list, - global_mean_list, - global_var_list, - xpu::Activation_t::RELU, - is_nchw, - has_shortcut, - fuse_add, - is_train); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "resnet_unit_fusion"); - } -}; - -template -class ResNetUnitGradXPUKernel : public framework::OpKernel { - using XPUType = typename XPUTypeTrait::Type; - - public: - void Compute(const 
framework::ExecutionContext &ctx) const override { - auto place = ctx.GetPlace(); - PADDLE_ENFORCE_EQ(platform::is_xpu_place(place), - true, - phi::errors::PreconditionNotMet("It must use XPUPlace.")); - - bool is_nchw = (ctx.Attr("data_format") == "NCHW"); - const phi::DenseTensor *y_grad = - ctx.Input(framework::GradVarName("Y")); - const phi::DenseTensor *x = ctx.Input("X"); - const phi::DenseTensor *filter_x = ctx.Input("FilterX"); - const phi::DenseTensor *scale_x = ctx.Input("ScaleX"); - const phi::DenseTensor *saved_mean_x = - ctx.Input("SavedMeanX"); - const phi::DenseTensor *saved_invstd_x = - ctx.Input("SavedInvstdX"); - const phi::DenseTensor *conv_out_x = ctx.Input("ConvX"); - const phi::DenseTensor *output = ctx.Input("Y"); - - phi::DenseTensor *x_grad = - ctx.Output(framework::GradVarName("X")); - phi::DenseTensor *filter_x_grad = - ctx.Output(framework::GradVarName("FilterX")); - phi::DenseTensor *scale_x_grad = - ctx.Output(framework::GradVarName("ScaleX")); - phi::DenseTensor *bias_x_grad = - ctx.Output(framework::GradVarName("BiasX")); - - int padding = ctx.Attr("padding"); - int stride = ctx.Attr("stride"); - int stride_z = ctx.Attr("stride_z"); - int dilation = ctx.Attr("dilation"); - int group = ctx.Attr("group"); - float eps = ctx.Attr("epsilon"); - bool has_shortcut = ctx.Attr("has_shortcut"); - bool fuse_add = ctx.Attr("fuse_add"); - std::string act_type = ctx.Attr("act_type"); - - auto &dev_ctx = ctx.template device_context(); - - std::vector x_list = { - reinterpret_cast(x->data())}; - std::vector w_list = { - reinterpret_cast(filter_x->data())}; - std::vector conv_y_list = { - reinterpret_cast(conv_out_x->data())}; - std::vector dx_list = { - reinterpret_cast(x_grad->mutable_data(place))}; - std::vector dw_list = { - reinterpret_cast(filter_x_grad->mutable_data(place))}; - - std::vector> x_shape_list = { - common::vectorize(x->dims())}; - - auto filter_x_shape = common::vectorize(filter_x->dims()); - std::vector x_ksize = {filter_x_shape[2], filter_x_shape[3]}; - if (!is_nchw) { - x_ksize[0] = filter_x_shape[1]; - x_ksize[1] = filter_x_shape[2]; - } - std::vector> ksize_list = {x_ksize}; - std::vector> stride_list = {{stride, stride}}; - std::vector paddings = {padding, padding}; - std::vector dilations = {dilation, dilation}; - - std::vector x_maxlist = {nullptr}; - std::vector w_maxlist = {nullptr}; - - std::vector scale_list = {scale_x->data()}; - std::vector batch_mean_list = {saved_mean_x->data()}; - std::vector batch_invstd_list = { - saved_invstd_x->data()}; - std::vector dscale_list = { - scale_x_grad->mutable_data(place)}; - std::vector dbias_list = {bias_x_grad->mutable_data(place)}; - - if (has_shortcut) { - // X Z - // | | - // NormConv NormConv - // | | - // BNStatsFinalize BNStatsFinalize - // \ / - // ScaleBiasAddRelu - // | - // Y - const phi::DenseTensor *z = ctx.Input("Z"); - const phi::DenseTensor *filter_z = ctx.Input("FilterZ"); - const phi::DenseTensor *scale_z = ctx.Input("ScaleZ"); - const phi::DenseTensor *saved_mean_z = - ctx.Input("SavedMeanZ"); - const phi::DenseTensor *saved_invstd_z = - ctx.Input("SavedInvstdZ"); - const phi::DenseTensor *conv_out_z = ctx.Input("ConvZ"); - - phi::DenseTensor *z_grad = - ctx.Output(framework::GradVarName("Z")); - phi::DenseTensor *filter_z_grad = - ctx.Output(framework::GradVarName("FilterZ")); - phi::DenseTensor *scale_z_grad = - ctx.Output(framework::GradVarName("ScaleZ")); - phi::DenseTensor *bias_z_grad = - ctx.Output(framework::GradVarName("BiasZ")); - 
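// The gradient kernel mirrors the forward packing: dx_list/dw_list receive
// the Z-branch gradients at index 1 below, while the plain fuse_add case
// (identity shortcut, no conv) only appends z_grad to dx_list and leaves
// dw_list untouched.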
x_list.push_back(reinterpret_cast(z->data())); - w_list.push_back(reinterpret_cast(filter_z->data())); - conv_y_list.push_back( - reinterpret_cast(conv_out_z->data())); - dx_list.push_back( - reinterpret_cast(z_grad->mutable_data(place))); - dw_list.push_back( - reinterpret_cast(filter_z_grad->mutable_data(place))); - x_shape_list.push_back(common::vectorize(z->dims())); - - auto filter_z_shape = common::vectorize(filter_z->dims()); - std::vector ksize_z = {filter_z_shape[2], filter_z_shape[3]}; - if (!is_nchw) { - ksize_z[0] = filter_z_shape[1]; - ksize_z[1] = filter_z_shape[2]; - } - ksize_list.push_back(ksize_z); - stride_list.push_back({stride_z, stride_z}); - x_maxlist.push_back(nullptr); - w_maxlist.push_back(nullptr); - - scale_list.push_back(scale_z->data()); - batch_mean_list.push_back(saved_mean_z->data()); - batch_invstd_list.push_back(saved_invstd_z->data()); - dscale_list.push_back(scale_z_grad->mutable_data(place)); - dbias_list.push_back(bias_z_grad->mutable_data(place)); - } else { - if (fuse_add) { - auto z_grad = ctx.Output(framework::GradVarName("Z")); - dx_list.push_back( - reinterpret_cast(z_grad->mutable_data(place))); - } - } - - int r = xpu::resnet_unit_grad_fusion( - dev_ctx.x_context(), - x_list, - w_list, - reinterpret_cast(y_grad->data()), - reinterpret_cast(output->data()), - conv_y_list, - dx_list, - dw_list, - x_shape_list, - filter_x_shape[0], - ksize_list, - stride_list, - paddings, - dilations, - group, - x_maxlist, - w_maxlist, - scale_list, - batch_mean_list, - batch_invstd_list, - dscale_list, - dbias_list, - xpu::Activation_t::RELU, - eps, - is_nchw, - has_shortcut, - fuse_add); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "resnet_unit_grad_fusion"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -PD_REGISTER_STRUCT_KERNEL(resnet_unit, - XPU, - ALL_LAYOUT, - ops::ResNetUnitXPUKernel, - phi::dtype::float16, - float) {} -PD_REGISTER_STRUCT_KERNEL(resnet_unit_grad, - XPU, - ALL_LAYOUT, - ops::ResNetUnitGradXPUKernel, - phi::dtype::float16, - float) {} diff --git a/paddle/fluid/operators/linear_chain_crf_op.cc b/paddle/fluid/operators/linear_chain_crf_op.cc deleted file mode 100644 index a27863819fedd..0000000000000 --- a/paddle/fluid/operators/linear_chain_crf_op.cc +++ /dev/null @@ -1,410 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/linear_chain_crf_op.h" - -#include - -namespace paddle { -namespace operators { - -class LinearChainCRFOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("Emission", - "(phi::DenseTensor). When a phi::DenseTensor " - "input,A 2-D phi::DenseTensor" - " with shape [N x D], where N is the size of the " - "mini-batch and D is the total tag number. The unscaled emission " - "weight matrix for the linear chain CRF. 
When a Tensor input," - "A Tensor with shape [N x S x D], where N is batch number," - "S is max length of sequences, D is the total tag number." - "A phi::DenseTensor with type float32, float64."); - AddInput("Transition", - "(Tensor, default Tensor) A 2-D Tensor with shape " - "[(D + 2) x D]. The learnable parameter for the linear_chain_crf " - "operator. See more details in the operator's comments."); - AddInput("Label", - "(phi::DenseTensor), when a phi::DenseTensor input, " - "[N x 1], where N is the total element number in a mini-batch. " - "when a Tensor input, [N x S], where N is batch number. " - "S is max length of sequences. The ground truth." - "A phi::DenseTensor with int64."); - AddInput("Length", - "(Tensor, default Tensor) A Tensor with shape " - "[M x 1], where M is the sequence number in a mini-batch." - "A Tensor with type int64.") - .AsDispensable(); - AddOutput( - "Alpha", - "(Tensor, default Tensor), the same shape with Emission. " - "The forward vectors for the entire batch. Denote it as $\alpha$. " - "$\alpha$ is a memo table used to calculate the normalization " - "factor in CRF. $\alpha[k, v]$ stores the unnormalized " - "probabilities of all possible unfinished sequences of tags that end " - "at position $k$ with tag $v$. For each $k$, " - "$\alpha[k, v]$ is a vector of length $D$ with a component for " - "each tag value $v$. This vector is called a forward vector and " - "will also be used in backward computations.") - .AsIntermediate(); - AddOutput( - "EmissionExps", - "(Tensor, default Tensor), the same shape with Emission. " - "The exponentials of Input(Emission). This is an intermediate " - "computational result in forward computation, and will be reused in " - "backward computation." - "A phi::DenseTensor with type float32, float64.") - .AsIntermediate(); - AddOutput( - "TransitionExps", - "(Tensor, default Tensor) A 2-D Tensor with shape " - "[(D + 2) x D]. The exponentials of Input(Transition). This is an " - "intermediate computational result in forward computation, and " - "will be reused in backward computation." - "A phi::DenseTensor with type float32, float64.") - .AsIntermediate(); - AddOutput( - "LogLikelihood", - "(Tensor, default Tensor) The logarithm of the conditional " - "likelihood of each training sample in a mini-batch. This is a 2-D " - "tensor with shape [S x 1], where S is the sequence number in a " - "mini-batch. Note: S is equal to the sequence number in a mini-batch. " - "A Tensor with type float32, float64."); - AddComment(R"DOC( -Conditional Random Field defines an undirected probabilistic graph with nodes -denoting random variables and edges denoting dependencies between these -variables. CRF learns the conditional probability $P(Y|X)$, where -$X = (x_1, x_2, ... , x_n)$ are structured inputs and -$Y = (y_1, y_2, ... , y_n)$ are labels for the inputs. - -Linear chain CRF is a special case of CRF that is useful for sequence labeling -task. Sequence labeling tasks do not assume a lot of conditional -independences among inputs. The only constraint they impose is that the input -and output must be linear sequences. Thus, the graph of such a CRF is a simple -chain or a line, which results in the linear chain CRF. - -This operator implements the Forward-Backward algorithm for the linear chain -CRF. Please refer to http://www.cs.columbia.edu/~mcollins/fb.pdf and -http://cseweb.ucsd.edu/~elkan/250Bwinter2012/loglinearCRFs.pdf for details. - -Equation: - -1. Denote Input(Emission) to this operator as $x$ here. -2. 
The first D values of Input(Transition) to this operator are for starting -weights, denoted as $a$ here. -3. The next D values of Input(Transition) of this operator are for ending -weights, denoted as $b$ here. -4. The remaining values of Input(Transition) are for transition weights, -denoted as $w$ here. -5. Denote Input(Label) as $s$ here. - -The probability of a sequence $s$ of length $L$ is defined as: -$$P(s) = (1/Z) \exp(a_{s_1} + b_{s_L} - + \sum_{l=1}^L x_{s_l} - + \sum_{l=2}^L w_{s_{l-1},s_l})$$ - -where $Z$ is a normalization value so that the sum of $P(s)$ over -all possible sequences is 1, and $x$ is the emission feature weight -to the linear chain CRF. - -Finally, the linear chain CRF operator outputs the logarithm of the conditional -likelihood of each training sample in a mini-batch. - -NOTE: - -1. The feature function for a CRF is made up of the emission features and the -transition features. The emission feature weights are NOT computed in -this operator. They MUST be computed first before this operator is called. - -2. Because this operator performs global normalization over all possible -sequences internally, it expects UNSCALED emission feature weights. -Please do not call this op with the emission feature being output of any -nonlinear activation. - -3. The 2nd dimension of Input(Emission) MUST be equal to the tag number. - -)DOC"); - } -}; - -class LinearChainCRFOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK( - ctx->HasInput("Emission"), "Input", "Emission", "LinearChainCRF"); - OP_INOUT_CHECK( - ctx->HasInput("Transition"), "Input", "Transition", "LinearChainCRF"); - OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label", "LinearChainCRF"); - - OP_INOUT_CHECK( - ctx->HasOutput("Alpha"), "Output", "Alpha", "LinearChainCRF"); - OP_INOUT_CHECK(ctx->HasOutput("EmissionExps"), - "Output", - "EmissionExps", - "LinearChainCRF"); - OP_INOUT_CHECK(ctx->HasOutput("TransitionExps"), - "Output", - "TransitionExps", - "LinearChainCRF"); - OP_INOUT_CHECK(ctx->HasOutput("LogLikelihood"), - "Output", - "LogLikelihood", - "LinearChainCRF"); - - auto transition_dims = ctx->GetInputDim("Transition"); - PADDLE_ENFORCE_EQ(transition_dims.size(), - 2UL, - phi::errors::InvalidArgument( - "The Input(Transition) should be a 2-D tensor. But " - "received: input rank %u, input shape [%s].", - transition_dims.size(), - transition_dims)); - bool check = true; - if ((!ctx->IsRuntime()) && - (transition_dims[0] <= 0 || transition_dims[1] <= 0)) { - check = false; - } - if (check) { - PADDLE_ENFORCE_EQ( - transition_dims[0] - 2, - transition_dims[1], - phi::errors::InvalidArgument( - "An invalid dimension for the Input(Transition), which should " - "be a 2-D tensor with shape [(D + 2) x D]. But received: input " - "rank %u, " - "input shape [%s].", - transition_dims.size(), - transition_dims)); - } - auto emission_dims = ctx->GetInputDim("Emission"); - if (ctx->HasInput("Length")) { - PADDLE_ENFORCE_EQ(emission_dims.size(), - 3, - phi::errors::InvalidArgument( - "The Input(Emission) should be a 3-D tensor. 
But " - "received: input rank %u, input shape [%s].", - emission_dims.size(), - emission_dims)); - auto label_dims = ctx->GetInputDim("Label"); - PADDLE_ENFORCE_EQ( - (label_dims.size() == 3UL && label_dims[2] == 1) || - (label_dims.size() == 2UL), - true, - phi::errors::InvalidArgument( - "The Input(Label) should be a 3-D tensor with last dimension " - "fixed to 1 or a 2-D tensor in padding mode. But received: input " - "rank %u, input shape [%s].", - label_dims.size(), - label_dims)); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ(emission_dims[0], - label_dims[0], - phi::errors::InvalidArgument( - "The batch size of Input(Emission) " - "and Input(Label) should be the same. But " - "received Input(Emission): " - "rank %u, shape [%s]; received Input(Label): " - "rank %u, shape [%s].", - emission_dims.size(), - emission_dims, - label_dims.size(), - label_dims)); - PADDLE_ENFORCE_EQ(emission_dims[1], - label_dims[1], - phi::errors::InvalidArgument( - "The max length of Input(Emission) " - "and Input(Label) should be the same. But " - "received Input(Emission): " - "rank %u, shape [%s]; received Input(Label): " - "rank %u, shape [%s].", - emission_dims.size(), - emission_dims, - label_dims.size(), - label_dims)); - } - } else { - PADDLE_ENFORCE_EQ( - emission_dims.size(), - 2, - phi::errors::InvalidArgument( - "The Input(Emission) should be a 2-D tensor. But received: " - "input rank %u, input shape [%s].", - emission_dims.size(), - emission_dims)); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ(emission_dims[1], - transition_dims[1], - phi::errors::InvalidArgument( - "The 2nd dimension of the Input(Emission) and " - "the Input(Transition) " - "should be equal to the tag number. But received " - "Input(Emission): rank " - "%u, shape [%s]; received Input(Transition): " - "rank %u, shape [%s].", - emission_dims.size(), - emission_dims, - transition_dims.size(), - transition_dims)); - } - - auto label_dims = ctx->GetInputDim("Label"); - PADDLE_ENFORCE_EQ( - label_dims.size(), - 2, - phi::errors::InvalidArgument( - "The Input(Label) should be a 2-D tensor with the 2nd " - "dimensions fixed to 1. But received: input rank %u, " - "input shape [%s].", - label_dims.size(), - label_dims)); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ( - emission_dims[0], - label_dims[0], - phi::errors::InvalidArgument( - "The first dimension of Input(Emission) and Input(Label) " - "should be the same. But received Input(Emission): rank %u, " - "shape " - "[%s]; received Input(Label): rank %u, shape [%s].", - emission_dims.size(), - emission_dims, - label_dims.size(), - label_dims)); - } - } - ctx->SetOutputDim("Alpha", emission_dims); - ctx->SetOutputDim("EmissionExps", emission_dims); - ctx->SetOutputDim("TransitionExps", transition_dims); - // TODO(caoying) This is tricky. The 1st dimension of Output(LogLikelihood) - // is the sequence number in a mini-batch. The dimension set here should be - // resized to its correct size in the function Compute. Fix this once we can - // get LoD information in the InferShape interface. - ctx->SetOutputDim("LogLikelihood", {emission_dims[0], 1}); - } - - protected: - // Explicitly set that the data type of computation kernel of linear_chain_crf - // is determined by its input "Emission". 
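// Note: the kernel place is pinned to platform::CPUPlace() below on purpose;
// as the kernel implementation states, the computation only runs on the CPU,
// and both the forward and the grad kernels are registered for CPU only.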
- phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey( - OperatorWithKernel::IndicateVarDataType(ctx, "Emission"), - platform::CPUPlace()); - } -}; - -class LinearChainCRFGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("EmissionExps"), - "Input", - "EmissionExps", - "LinearChainCRFGrad"); - OP_INOUT_CHECK(ctx->HasInput("TransitionExps"), - "Input", - "TransitionExps", - "LinearChainCRFGrad"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("LogLikelihood")), - "Input", - framework::GradVarName("LogLikelihood"), - "LinearChainCRFGrad"); - - auto transition_exps_dims = ctx->GetInputDim("TransitionExps"); - auto emission_exps_dims = ctx->GetInputDim("EmissionExps"); - if (ctx->HasOutput(framework::GradVarName("Emission"))) { - ctx->SetOutputDim(framework::GradVarName("Emission"), emission_exps_dims); - if (ctx->HasInput("Length") == false) { - ctx->ShareLoD("Emission", framework::GradVarName("Emission")); - } - } - - if (ctx->HasOutput(framework::GradVarName("Transition"))) { - ctx->SetOutputDim(framework::GradVarName("Transition"), - transition_exps_dims); - ctx->ShareLoD("Transition", framework::GradVarName("Transition")); - } - } - - protected: - // Explicitly set that the data type of output of the linear_chain_crf_grad - // operator is determined by its input: gradients of LogLikelihood. - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("LogLikelihood")), - platform::CPUPlace()); - } -}; - -template -class LinearChainCRFGradMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("linear_chain_crf_grad"); - op->SetAttrMap(this->Attrs()); - op->SetInput("Emission", this->Input("Emission")); - op->SetInput("Transition", this->Input("Transition")); - op->SetInput("Label", this->Input("Label")); - op->SetInput("Alpha", this->Output("Alpha")); - op->SetInput("EmissionExps", this->Output("EmissionExps")); - op->SetInput("TransitionExps", this->Output("TransitionExps")); - if (this->HasInput("Length")) { - op->SetInput("Length", this->Input("Length")); - } - op->SetInput(framework::GradVarName("LogLikelihood"), - this->OutputGrad("LogLikelihood")); - - op->SetOutput(framework::GradVarName("Emission"), - this->InputGrad("Emission")); - op->SetOutput(framework::GradVarName("Transition"), - this->InputGrad("Transition")); - } -}; - -DECLARE_NO_NEED_BUFFER_VARS_INFERER(LinearChainCRFGradNoNeedBufferVarsInferer, - "Transition", - "Emission"); - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR(linear_chain_crf, - ops::LinearChainCRFOp, - ops::LinearChainCRFOpMaker, - ops::LinearChainCRFGradMaker, - ops::LinearChainCRFGradMaker); -REGISTER_OPERATOR(linear_chain_crf_grad, - ops::LinearChainCRFGradOp, - ops::LinearChainCRFGradNoNeedBufferVarsInferer); - -PD_REGISTER_STRUCT_KERNEL(linear_chain_crf, - CPU, - ALL_LAYOUT, - ops::LinearChainCRFOpKernel, - float, - double) {} -PD_REGISTER_STRUCT_KERNEL(linear_chain_crf_grad, - CPU, - ALL_LAYOUT, - ops::LinearChainCRFGradOpKernel, - float, - double) {} diff --git 
a/paddle/fluid/operators/linear_chain_crf_op.h b/paddle/fluid/operators/linear_chain_crf_op.h deleted file mode 100644 index 01ed8463701e7..0000000000000 --- a/paddle/fluid/operators/linear_chain_crf_op.h +++ /dev/null @@ -1,457 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -static inline T NormalizeL1(T* x, size_t len) { - T sum = 0.; - for (size_t i = 0; i < len; ++i) sum += x[i]; - // (This comment is from the old LinearChainCRFLayer.) - // Right now, we just bet that sum won't be zero. If this really happens, we - // will figure out what should be done then. - PADDLE_ENFORCE_GT( - sum, - 0., - phi::errors::InvalidArgument( - "The unnormalized probabilities of all possible unfinished " - "sequences must be greater than 0.")); - T s = 1. / sum; - for (size_t i = 0; i < len; ++i) x[i] *= s; - return sum; -} - -template -struct ScalarMul { - explicit ScalarMul(const T& scalar) : scalar(scalar) {} - T operator()(const T& val) const { return val * scalar; } - - T scalar; -}; - -using framework::LoD; - -template -class LinearChainCRFOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const phi::DenseTensor* emission_weights = - ctx.Input("Emission"); - const phi::DenseTensor* transition_weights = - ctx.Input("Transition"); - - phi::DenseTensor* emission_exps = - ctx.Output("EmissionExps"); - phi::DenseTensor* transition_exps = - ctx.Output("TransitionExps"); - phi::DenseTensor* alpha = ctx.Output("Alpha"); - phi::DenseTensor* ll = ctx.Output("LogLikelihood"); - - // Because the computation codes only runs on CPU, here the memory for all - // the outputs is FIXED to be allocated on the CPU memory. - emission_exps->mutable_data(platform::CPUPlace()); - alpha->mutable_data(platform::CPUPlace()); - transition_exps->mutable_data(platform::CPUPlace()); - auto emission_dims = emission_weights->dims(); - - const phi::DenseTensor* label = ctx.Input("Label"); - phi::DenseTensor emission_weights_tmp = *emission_weights; - phi::DenseTensor label_tmp = *label; - phi::DenseTensor emission_exps_tmp = *emission_exps; - phi::DenseTensor alpha_tmp = *alpha; - int64_t seq_num = 0; - int64_t batch_size; - int64_t tag_num; - const int64_t* length_data = nullptr; - framework::LoD in_lod; - if (ctx.HasInput("Length")) { - const phi::DenseTensor* label_length = - ctx.Input("Length"); - length_data = label_length->data(); - seq_num = label_length->numel(); - PADDLE_ENFORCE_EQ( - seq_num, - emission_dims[0], - phi::errors::InvalidArgument( - "the size of Input(length) must be equal to " - "emission_dims[0]. 
But input_size = %d, emission_dims[0] = %d.", - seq_num, - emission_dims[0])); - auto label_dims = label->dims(); - PADDLE_ENFORCE_EQ( - seq_num, - label_dims[0], - phi::errors::InvalidArgument( - "the size of Input(length) must be equal to " - "label_dims[0]. But input_size = %d, label_dims[0] = %d.", - seq_num, - label_dims[0])); - - batch_size = emission_dims[0] * emission_dims[1]; - tag_num = emission_dims[2]; - emission_weights_tmp.Resize({batch_size, tag_num}); - label_tmp.Resize({batch_size, 1}); - alpha_tmp.Resize({batch_size, tag_num}); - emission_exps_tmp.Resize({batch_size, tag_num}); - phi::funcs::set_constant( - ctx.device_context(), emission_exps, static_cast(0.0)); - phi::funcs::set_constant( - ctx.device_context(), alpha, static_cast(0.0)); - } else { - in_lod = ctx.Input("Label")->lod(); - PADDLE_ENFORCE_NE( - in_lod.size(), - 0, - phi::errors::InvalidArgument("Input(Label) must be a sequence.")); - seq_num = in_lod[0].size() - 1; - batch_size = emission_dims[0]; - tag_num = emission_dims[1]; - } - - // Resize the output tensor to its correct dimension. - ll->Resize({seq_num, 1}); - ll->mutable_data(platform::CPUPlace()); - // Now, all the inputs and outputs should be on the CPU memory. - phi::DenseTensor emission_row_max; - emission_row_max.mutable_data( - common::make_ddim({static_cast(batch_size), 1}), - platform::CPUPlace()); - auto& place = - *ctx.template device_context().eigen_device(); - auto x = framework::EigenMatrix::From(emission_weights_tmp); - auto x_row_max = framework::EigenMatrix::From(emission_row_max); - x_row_max.device(place) = - x.maximum(Eigen::DSizes(1)) - .reshape(Eigen::DSizes(static_cast(batch_size), 1)); - auto x_exps = framework::EigenMatrix::From(emission_exps_tmp); - x_exps.device(place) = - (x - x_row_max.broadcast(Eigen::DSizes(1, tag_num))).exp(); - auto w = framework::EigenMatrix::From(*transition_weights); - auto w_exps = framework::EigenMatrix::From(*transition_exps); - w_exps.device(place) = w.exp(); - T* log_likelihood = ll->data(); - for (int64_t i = 0; i < seq_num; ++i) { - int64_t start_pos = 0; - int64_t end_pos = 0; - if (ctx.HasInput("Length")) { - start_pos = i * emission_dims[1]; - end_pos = start_pos + length_data[i]; - } else { - start_pos = static_cast(in_lod[0][i]); - end_pos = static_cast(in_lod[0][i + 1]); - } - if (end_pos == start_pos) { - // If an empty input sequence is given, pad 0 for its cost. 
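// ForwardOneSequence (below) evaluates log Z with the forward recursion.
// Because each alpha row is L1-renormalized by NormalizeL1 and the emissions
// are shifted by their per-row max, the accumulated terms amount to
//   log Z = sum_k (x_row_max[k] + log c_k)
//           + log( sum_i alpha[L-1][i] * w_exps[tag_num + i] )
// where c_k is the normalizer returned by NormalizeL1 at step k.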
- log_likelihood[i] = 0.; - continue; - } - const phi::DenseTensor one_seq = - emission_weights_tmp.Slice(start_pos, end_pos); - phi::DenseTensor one_seq_row_max = - emission_row_max.Slice(start_pos, end_pos); - phi::DenseTensor one_seq_exps = - emission_exps_tmp.Slice(start_pos, end_pos); - const phi::DenseTensor one_seq_label = - label_tmp.Slice(start_pos, end_pos); - phi::DenseTensor one_seq_alpha = alpha_tmp.Slice(start_pos, end_pos); - log_likelihood[i] = ForwardOneSequence(one_seq, - one_seq_row_max, - one_seq_exps, - *transition_weights, - *transition_exps, - one_seq_label, - &one_seq_alpha); - } - }; - - private: - T ForwardOneSequence(const phi::DenseTensor& emission, - const phi::DenseTensor& emission_row_max, - const phi::DenseTensor& emission_exps, - const phi::DenseTensor& trans_weights, - const phi::DenseTensor& trans_weight_exps, - const phi::DenseTensor& label, - phi::DenseTensor* alpha) const { - const T* x = emission.data(); - const T* x_row_max = emission_row_max.data(); - const T* x_exps = emission_exps.data(); - const T* w = trans_weights.data(); - const T* w_exps = trans_weight_exps.data(); - T* alpha_value = alpha->data(); - - auto x_dims = emission.dims(); - const size_t seq_length = x_dims[0]; - const size_t tag_num = x_dims[1]; - // The 1st row of w are transition weights for start mask. - // The 2nd row of w are transition weights for end mask. - // Transition weights between other tags begin from the 3rd row of w. - const size_t state_trans_base_idx = 2; - - for (size_t i = 0; i < tag_num; ++i) { - alpha_value[i] = w_exps[i] * x_exps[i]; - } - T ll = -x_row_max[0] - std::log(NormalizeL1(alpha_value, tag_num)); - - for (size_t k = 1; k < seq_length; ++k) { - for (size_t i = 0; i < tag_num; ++i) { - T sum = 0.; - for (size_t j = 0; j < tag_num; ++j) { - sum += alpha_value[(k - 1) * tag_num + j] * // (*) - w_exps[(j + state_trans_base_idx) * tag_num + i]; - } - alpha_value[k * tag_num + i] = x_exps[k * tag_num + i] * sum; - } - // NormalizeL1 is to avoid underflow or overflow at (*). - ll -= x_row_max[k] + - std::log(NormalizeL1(alpha_value + k * tag_num, tag_num)); - } - T sum = 0.; - for (size_t i = 0; i < tag_num; ++i) { - sum += alpha_value[(seq_length - 1) * tag_num + i] * w_exps[tag_num + i]; - } - ll -= std::log(sum); - // Now ll is equal to -log(Z). - - const int64_t* lbl = label.data(); - PADDLE_ENFORCE_LT( - static_cast(*std::max_element(lbl, lbl + seq_length)), - tag_num, - phi::errors::InvalidArgument( - "An invalid tag label that excesses the largest tag number.")); - - // Calculate the nominator part, which depends on the label sequence. 
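// In the notation of the operator DOC, the terms added below form the
// unnormalized score of the labeled path s:
//   score(s) = a_{s_1} + b_{s_L} + sum_{l=1..L} x_{l,s_l}
//              + sum_{l=2..L} w_{s_{l-1},s_l}
// so that, combined with ll = -log(Z) from above, the value returned is
// -(score(s) - log Z), i.e. the negative log-likelihood of the sequence.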
- ll += w[lbl[0]] /*start transition*/ + x[lbl[0]] + - w[tag_num + lbl[seq_length - 1]] /*end transition*/; - for (size_t k = 1; k < seq_length; ++k) { - ll += x[k * tag_num + lbl[k]] + - w[(lbl[k - 1] + state_trans_base_idx) * tag_num + lbl[k]]; - } - return -ll; - } -}; - -template -class LinearChainCRFGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const phi::DenseTensor* label = ctx.Input("Label"); - const phi::DenseTensor* emission_exps = - ctx.Input("EmissionExps"); - const phi::DenseTensor* transition_exps = - ctx.Input("TransitionExps"); - const phi::DenseTensor* alpha = ctx.Input("Alpha"); - const T* ll_grad = - ctx.Input(framework::GradVarName("LogLikelihood")) - ->data(); - phi::DenseTensor* emission_grad = - ctx.Output(framework::GradVarName("Emission")); - auto* emission_grad_data = - emission_grad->mutable_data(platform::CPUPlace()); - memset(emission_grad_data, 0, emission_grad->numel() * sizeof(T)); - phi::DenseTensor alpha_tmp = *alpha; - phi::DenseTensor label_tmp = *label; - phi::DenseTensor emission_exps_tmp = *emission_exps; - phi::DenseTensor emission_grad_tmp = *emission_grad; - // getting seq_num using padding or not - int64_t seq_num = 0; - framework::LoD in_lod; - const int64_t* length_data = nullptr; - if (ctx.HasInput("Length")) { - const phi::DenseTensor* label_length = - ctx.Input("Length"); - length_data = label_length->data(); - seq_num = label_length->numel(); - auto emission_dims = emission_grad->dims(); - auto label_dims = label->dims(); - emission_grad_tmp.Resize( - {emission_dims[0] * emission_dims[1], emission_dims[2]}); - label_tmp.Resize({label_dims[0] * label_dims[1], 1}); - alpha_tmp.Resize({emission_dims[0] * emission_dims[1], emission_dims[2]}); - emission_exps_tmp.Resize( - {emission_dims[0] * emission_dims[1], emission_dims[2]}); - } else { - in_lod = ctx.Input("Label")->lod(); - PADDLE_ENFORCE_NE( - in_lod.size(), - 0, - phi::errors::InvalidArgument("Input(Label) must be a sequence.")); - seq_num = static_cast(in_lod[0].size() - 1); - } - - phi::DenseTensor* transition_grad = - ctx.Output(framework::GradVarName("Transition")); - - // TODO(caoying) Fix this constraint. When the Input(Emission) is from the - // data reader operator, it can have no gradients. - if (transition_grad) { - transition_grad->mutable_data(platform::CPUPlace()); - phi::funcs::set_constant( - ctx.device_context(), transition_grad, static_cast(0.)); - } - // Now, all the inputs and outputs should be on the CPU memory. - auto emission_dims = emission_exps->dims(); - // Beta is the memo table used in dynamic programming to calculate the - // backward vectors. For a backward vector i (the i-th row of beta), it - // captures the unnormalized probabilities of partial sequences starting - // at position i. 
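// With forward vectors alpha and backward vectors beta, the per-position
// tag marginal used by the gradient below is
//   P(y_k = i | X) = alpha[k][i] * beta[k][i] / sum_j (alpha[k][j] * beta[k][j])
// which is exactly prob / row_sum in the Eigen expression further down; the
// emission gradient then follows the expected-minus-observed pattern,
// (marginal - onehot(label)) scaled by the incoming LogLikelihood gradient.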
- phi::DenseTensor beta; - beta.mutable_data(emission_dims, platform::CPUPlace()); - if (ctx.HasInput("Length")) { - beta.Resize({emission_dims[0] * emission_dims[1], emission_dims[2]}); - } - - for (int64_t i = 0; i < seq_num; ++i) { - int64_t start_pos = 0; - int64_t end_pos = 0; - if (ctx.HasInput("Length")) { - start_pos = i * emission_dims[1]; - end_pos = start_pos + length_data[i]; - } else { - start_pos = static_cast(in_lod[0][i]); - end_pos = static_cast(in_lod[0][i + 1]); - } - - if (end_pos == start_pos) { - continue; - } - const phi::DenseTensor one_seq_emission_exps = - emission_exps_tmp.Slice(start_pos, end_pos); - const phi::DenseTensor one_seq_label = - label_tmp.Slice(start_pos, end_pos); - const phi::DenseTensor one_seq_alpha = - alpha_tmp.Slice(start_pos, end_pos); - phi::DenseTensor one_seq_beta = beta.Slice(start_pos, end_pos); - phi::DenseTensor one_seq_emission_grad = - emission_grad_tmp.Slice(start_pos, end_pos); - BackwardOneSequence(ctx.template device_context(), - ll_grad[i], - one_seq_emission_exps, - *transition_exps, - one_seq_alpha, - one_seq_label, - &one_seq_beta, - transition_grad, - &one_seq_emission_grad); - } - }; - - private: - void BackwardOneSequence(const phi::CPUContext& ctx, - const T ll_grad, - const phi::DenseTensor& emission_exps, - const phi::DenseTensor& transition_exps, - const phi::DenseTensor& alpha, - const phi::DenseTensor& label, - phi::DenseTensor* beta, - phi::DenseTensor* transition_grad, - phi::DenseTensor* emission_grad) const { - const T* w_exps = transition_exps.data(); - const T* x_exps = emission_exps.data(); - const int64_t* label_value = label.data(); - T* beta_value = beta->data(); - auto x_dims = emission_exps.dims(); - const size_t seq_length = x_dims[0]; - const size_t tag_num = x_dims[1]; - const size_t state_trans_base_idx = 2; - - // Calculate the backward vectors: beta. - // First, calculate the initial state. - for (size_t i = 0; i < tag_num; ++i) { - beta_value[(seq_length - 1) * tag_num + i] = w_exps[tag_num + i]; - } - NormalizeL1(beta_value + (seq_length - 1) * tag_num, tag_num); - for (int k = static_cast(seq_length) - 2; k >= 0; --k) { - for (size_t i = 0; i < tag_num; ++i) { - T sum = 0.; - for (size_t j = 0; j < tag_num; ++j) { - sum += w_exps[(i + state_trans_base_idx) * tag_num + j] * // (**) - x_exps[(k + 1) * tag_num + j] * - beta_value[(k + 1) * tag_num + j]; - } - beta_value[k * tag_num + i] = sum; - } - // NormalizeL1 is to avoid underflow or overflow at (**). - NormalizeL1(beta_value + k * tag_num, tag_num); - } - - auto x_grad_mat = framework::EigenMatrix::From(*emission_grad); - auto alpha_mat = framework::EigenMatrix::From(alpha); - auto beta_mat = framework::EigenMatrix::From(*beta); - - auto* place = ctx.eigen_device(); - auto prob = alpha_mat * beta_mat; - auto row_sum = prob.sum(Eigen::DSizes(1)) - .reshape(Eigen::DSizes(seq_length, 1)) - .broadcast(Eigen::DSizes(1, tag_num)); - x_grad_mat.device(*place) = - (prob / row_sum).unaryExpr(ScalarMul(ll_grad)); - - for (size_t k = 0; k < seq_length; ++k) { - x_grad_mat(k, label_value[k]) -= static_cast(ll_grad); - } - - if (transition_grad) { - T* trans_grad = transition_grad->data(); - for (size_t k = 0; k < tag_num; ++k) { - // Do not multiply by the output gradient here, because x_grad_mat has - // already done this. 
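-      // Rows 0 and 1 of the transition matrix hold the start- and
-      // end-transition weights, so their gradients are read off the first
-      // and last rows of x_grad_mat respectively.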
- trans_grad[k] += x_grad_mat(/*from start state*/ 0, k); - trans_grad[tag_num + k] += - x_grad_mat(/*to end state*/ seq_length - 1, k); - } - - auto x_exps_mat = framework::EigenMatrix::From(emission_exps); - - // TODO(caoying): Fix this to avoid using this local variable if we can - // profile the training process. - phi::DenseTensor tmp; - tmp.mutable_data(beta->dims(), platform::CPUPlace()); - auto tmp_mat = framework::EigenMatrix::From(tmp); - auto prob = beta_mat * x_exps_mat; - auto row_sum = prob.sum(Eigen::DSizes(1)) - .reshape(Eigen::DSizes(seq_length, 1)) - .broadcast(Eigen::DSizes(1, tag_num)); - tmp_mat.device(*place) = prob / row_sum; - - for (size_t k = 1; k < seq_length; ++k) { - T sum = 0.; - for (size_t i = 0; i < tag_num; ++i) { - for (size_t j = 0; j < tag_num; ++j) { - sum += w_exps[(i + state_trans_base_idx) * tag_num + j] * // (**) - alpha_mat(k - 1, i) * tmp_mat(k, j); - } - } - sum = 1. / sum; - for (size_t i = 0; i < tag_num; ++i) { - for (size_t j = 0; j < tag_num; ++j) { - trans_grad[(i + state_trans_base_idx) * tag_num + j] += - sum * w_exps[(i + state_trans_base_idx) * tag_num + j] * - alpha_mat(k - 1, i) * tmp_mat(k, j) * ll_grad; - } - } - trans_grad[(label_value[k - 1] + state_trans_base_idx) * tag_num + - label_value[k]] -= static_cast(ll_grad); - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/lite/CMakeLists.txt b/paddle/fluid/operators/lite/CMakeLists.txt deleted file mode 100644 index ca3b62648378b..0000000000000 --- a/paddle/fluid/operators/lite/CMakeLists.txt +++ /dev/null @@ -1 +0,0 @@ -op_library(lite_engine_op DEPS lite_engine lite_tensor_utils) diff --git a/paddle/fluid/operators/lite/lite_engine_op.cc b/paddle/fluid/operators/lite/lite_engine_op.cc deleted file mode 100644 index 0ec1c55f7abee..0000000000000 --- a/paddle/fluid/operators/lite/lite_engine_op.cc +++ /dev/null @@ -1,45 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/lite/lite_engine_op.h" - -#include -#include - -namespace paddle { - -namespace operators { - -class LiteEngineOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("Xs", "A list of inputs.").AsDuplicable(); - AddOutput("Ys", "A list of outputs.").AsDuplicable(); - AddAttr( - "engine_key", - "The engine_key here is used to distinguish different Lite Engines"); - AddComment("Lite engine operator."); - } -}; - -class LiteInferVarType : public framework::VarTypeInference { - public: - void operator()(framework::InferVarTypeContext *ctx) const override {} -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR(lite_engine, ops::LiteEngineOp, ops::LiteEngineOpMaker); diff --git a/paddle/fluid/operators/lite/lite_engine_op.h b/paddle/fluid/operators/lite/lite_engine_op.h deleted file mode 100644 index 756fec24d9874..0000000000000 --- a/paddle/fluid/operators/lite/lite_engine_op.h +++ /dev/null @@ -1,105 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/inference/analysis/helper.h" -#include "paddle/fluid/inference/lite/engine.h" -#include "paddle/fluid/inference/lite/tensor_utils.h" -#include "paddle/fluid/inference/utils/singleton.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" - -namespace paddle { -namespace operators { - -class LiteEngineOp : public framework::OperatorBase { - private: - std::vector in_names_; - std::vector out_names_; - paddle::lite_api::PaddlePredictor *engine_; - framework::proto::VarType::Type precision_; - bool use_gpu_; - bool zero_copy_; - - public: - LiteEngineOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : framework::OperatorBase(type, inputs, outputs, attrs) { - in_names_ = Inputs("Xs"); - out_names_ = Outputs("Ys"); - engine_ = - inference::Singleton::Global().Get( - Attr("engine_key")); - if (Attr("enable_int8")) { - precision_ = framework::proto::VarType_Type_INT8; - } else { - precision_ = framework::proto::VarType_Type_FP32; - } - use_gpu_ = Attr("use_gpu"); - zero_copy_ = Attr("zero_copy"); - } - - void SetEngine(paddle::lite_api::PaddlePredictor *engine) { - engine_ = engine; - } - - protected: - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - Execute(scope, dev_place); - } - - void Execute(const framework::Scope &scope, - const platform::Place &dev_place) const { - const platform::DeviceContext *ctx = - platform::DeviceContextPool::Instance().Get(dev_place); - for (size_t i = 0; i < in_names_.size(); i++) { - phi::DenseTensor src_t = - 
inference::analysis::GetFromScope(scope, - in_names_[i]); - paddle::lite_api::Tensor dst_t = *(engine_->GetInput(i)); - VLOG(3) << "== fluid -> lite (" << in_names_[i] << " -> " - << engine_->GetInputNames()[i] << ")"; - inference::lite::utils::TensorCopy(&dst_t, &src_t, *ctx, zero_copy_); - } - VLOG(3) << "lite engine run"; - engine_->Run(); - VLOG(3) << "lite engine run done"; - for (size_t i = 0; i < out_names_.size(); i++) { - paddle::lite_api::Tensor src_t = *(engine_->GetOutput(i)); - phi::DenseTensor *dst_t = - &inference::analysis::GetFromScope(scope, - out_names_[i]); - VLOG(3) << "== lite -> fluid (" << out_names_[i] << " -> " - << engine_->GetOutputNames()[i] << ")"; - inference::lite::utils::TensorCopy(dst_t, &src_t, *ctx, zero_copy_); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/metrics/CMakeLists.txt b/paddle/fluid/operators/metrics/CMakeLists.txt deleted file mode 100644 index b968dbf288ee2..0000000000000 --- a/paddle/fluid/operators/metrics/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -include(operators) -if(WITH_UNITY_BUILD) - # Load Unity Build rules for operators in paddle/fluid/operators/metrics. - include(unity_build_rule.cmake) -endif() -register_operators() diff --git a/paddle/fluid/operators/metrics/precision_recall_op.cc b/paddle/fluid/operators/metrics/precision_recall_op.cc deleted file mode 100644 index 95a66cb2edd1d..0000000000000 --- a/paddle/fluid/operators/metrics/precision_recall_op.cc +++ /dev/null @@ -1,250 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/metrics/precision_recall_op.h" - -namespace paddle { -namespace operators { - -class PrecisionRecallOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasInput("MaxProbs"), - true, - phi::errors::NotFound( - "PrecisionRecallOp Input(MaxProbs) should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("Indices"), - true, - phi::errors::NotFound( - "PrecisionRecallOp Input(Indices) should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("Labels"), - true, - phi::errors::NotFound( - "PrecisionRecallOp Input(Labels) should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasOutput("BatchMetrics"), - true, - phi::errors::NotFound( - "PrecisionRecallOp Output(BatchMetrics) should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasOutput("AccumMetrics"), - true, - phi::errors::NotFound( - "PrecisionRecallOp Output(AccumMetrics) should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasOutput("AccumStatesInfo"), - true, - phi::errors::NotFound( - "PrecisionRecallOp Output(AccumStatesInfo) should not be null.")); - - int64_t cls_num = - static_cast(ctx->Attrs().Get("class_number")); - auto max_probs_dims = ctx->GetInputDim("MaxProbs"); - auto labels_dims = ctx->GetInputDim("Labels"); - - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ(max_probs_dims[1], - 1, - phi::errors::InvalidArgument( - "Each instance of PrecisionRecallOp " - "Input(MaxProbs) contains one max probability, " - "the shape of Input(MaxProbs) should be " - "[batch_size, 1], the 2nd dimension of " - "Input(MaxProbs) should be 1. But the 2nd " - "dimension we received is %d", - max_probs_dims[1])); - PADDLE_ENFORCE_EQ( - ctx->GetInputDim("Indices"), - max_probs_dims, - phi::errors::InvalidArgument( - "The shape of PrecisionRecallOp Input(Indices) should be same " - "with " - "max_probs_dims. But received the shape of Input(Indices) is " - "[%d, %d], max_probs_dims is [%d, %d]", - ctx->GetInputDim("Indices")[0], - ctx->GetInputDim("Indices")[1], - max_probs_dims[0], - max_probs_dims[1])); - PADDLE_ENFORCE_EQ( - max_probs_dims[0], - labels_dims[0], - phi::errors::InvalidArgument( - "The 1st dimension of PrecisionRecallOp Input(MaxProbs) and " - "Input(Labels) both should be batch_size" - "But the 1st dimension we received max_probs_dims[0] = %d, " - "labels_dims[0] = %d", - max_probs_dims[0], - labels_dims[0])); - PADDLE_ENFORCE_EQ(labels_dims[1], - 1, - phi::errors::InvalidArgument( - "The 2nd dimension of PrecisionRecallOp " - "Input(Labels) contains instance label and " - "the shape should be equal to 1. But the 2nd " - "dimension we received is %d", - labels_dims[1])); - } - if (ctx->HasInput("Weights")) { - auto weights_dims = ctx->GetInputDim("Weights"); - - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ( - weights_dims, - common::make_ddim({max_probs_dims[0], 1}), - phi::errors::InvalidArgument( - "The shape of PrecisionRecallOp Input(Weights) should be " - "[batch_size, 1]. But the shape we received is [%d, %d]", - weights_dims[0], - weights_dims[1])); - } - } - if (ctx->HasInput("StatesInfo")) { - auto states_dims = ctx->GetInputDim("StatesInfo"); - - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ( - states_dims, - common::make_ddim({cls_num, 4}), - phi::errors::InvalidArgument( - "The shape of PrecisionRecallOp Input(StatesInfo) should be " - "[class_number, 4]. 
But the shape we received is [%d, %d]", - states_dims[0], - states_dims[1])); - } - } - - // Layouts of BatchMetrics and AccumMetrics both are: - // [ - // macro average precision, macro average recall, macro average F1 score, - // micro average precision, micro average recall, micro average F1 score - // ] - ctx->SetOutputDim("BatchMetrics", {6}); - ctx->SetOutputDim("AccumMetrics", {6}); - // Shape of AccumStatesInfo is [class_number, 4] - // The layout of each row is: - // [ TP, FP, TN, FN ] - ctx->SetOutputDim("AccumStatesInfo", {cls_num, 4}); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - return phi::KernelKey( - OperatorWithKernel::IndicateVarDataType(ctx, "MaxProbs"), - ctx.GetPlace()); - } -}; - -class PrecisionRecallOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("MaxProbs", - "(Tensor, default Tensor) A 2-D tensor with shape N x 1, " - "where N is the batch size. Each row contains the max probability " - "of an instance which computed by the previous top_k (k=1) " - "operator."); - AddInput("Indices", - "(Tensor, default Tensor) A 2-D tensor with shape N x 1, " - "where N is the batch size. Each row contains the corresponding " - "index which computed by the previous top_k (k=1) operator."); - AddInput("Labels", - "(Tensor, default Tensor) A 2-D tensor with shape N x 1, " - "where N is the batch size. Each element is a label and the " - "value should be in [0, class_number - 1]."); - AddInput("Weights", - "(Tensor, default Tensor) A 2-D tensor with shape N x 1, " - "where N is the batch size. This input is optional. If provided, " - "weight of instance would be considered when computing metrics.") - .AsDispensable(); - AddInput("StatesInfo", - "(Tensor, default Tensor) A 2-D tensor with shape D x 4, " - "where D is the number of classes. This input is optional. If " - "provided, current state will be accumulated to this state and " - "the accumulation state will be the output state.") - .AsDispensable(); - AddOutput("BatchMetrics", - "(Tensor, default Tensor) A 1-D tensor with shape {6}. " - "This output tensor contains metrics for current batch data. " - "The layout is [macro average precision, macro average recall, " - "macro f1 score, micro average precision, micro average recall, " - "micro f1 score]."); - AddOutput("AccumMetrics", - "(Tensor, default Tensor) A 1-D tensor with shape {6}. " - "This output tensor contains metrics for accumulated data. " - "The layout is [macro average precision, macro average recall, " - "macro f1 score, micro average precision, micro average recall, " - "micro f1 score]."); - AddOutput("AccumStatesInfo", - "(Tensor, default Tensor) A 2-D tensor with shape D x 4, " - "where D is equal to class number. This output tensor contains " - "accumulated state variables used to compute metrics. The layout " - "for each class is [true positives, false positives, " - "true negatives, false negatives]."); - AddAttr("class_number", "(int) Number of classes to be evaluated."); - AddComment(R"DOC( -Precision Recall Operator. - -When given Input(Indices) and Input(Labels), this operator can be used -to compute various metrics including: -1. macro average precision -2. macro average recall -3. macro f1 score -4. micro average precision -5. micro average recall -6. micro f1 score - -To compute the above metrics, we need to do statistics for true positives, -false positives and false negatives. 
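-For each class c, the per-class values are
-  precision_c = TP_c / (TP_c + FP_c)
-  recall_c    = TP_c / (TP_c + FN_c)
-  F1_c        = 2 * precision_c * recall_c / (precision_c + recall_c)
-The macro metrics average these values over all classes, while the micro
-metrics apply the same formulas to the TP/FP/FN counts summed over all
-classes.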
Here the count of true negatives is not
-strictly needed for the metrics above, but it is cheap to maintain and may be
-useful, so the operator reports it as well.
-
-We define state as a 2-D tensor with shape [class_number, 4]. Each row of a
-state contains the statistic variables for the corresponding class. The
-layout of each row is: TP(true positives), FP(false positives),
-TN(true negatives), FN(false negatives). If Input(Weights) is provided,
-TP, FP, TN, FN will be calculated from the given weights instead of the
-instance counts.
-
-This operator also supports computing metrics across batches. To achieve
-this, Input(StatesInfo) should be provided. The statistics of the current
-batch will be combined with Input(StatesInfo), and Output(AccumStatesInfo)
-holds the accumulated state.
-
-Output(BatchMetrics) holds the metrics of the current batch data, while
-Output(AccumMetrics) holds the metrics of the accumulated data.
-
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(
-    precision_recall,
-    ops::PrecisionRecallOp,
-    ops::PrecisionRecallOpMaker,
-    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
-    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
-PD_REGISTER_STRUCT_KERNEL(precision_recall,
-                          CPU,
-                          ALL_LAYOUT,
-                          ops::PrecisionRecallKernel,
-                          float,
-                          double) {}
diff --git a/paddle/fluid/operators/metrics/precision_recall_op.h b/paddle/fluid/operators/metrics/precision_recall_op.h
deleted file mode 100644
index 8a276d2fa5a32..0000000000000
--- a/paddle/fluid/operators/metrics/precision_recall_op.h
+++ /dev/null
@@ -1,186 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T,
-          int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
-
-enum StateVariable { TP = 0, FP, TN, FN };
-
-template <typename T, typename DeviceContext>
-class PrecisionRecallKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in0 = ctx.Input<phi::DenseTensor>("Indices");
-    auto* in1 = ctx.Input<phi::DenseTensor>("Labels");
-    auto* in2 = ctx.Input<phi::DenseTensor>("Weights");
-    auto* in3 = ctx.Input<phi::DenseTensor>("StatesInfo");
-    auto* out0 = ctx.Output<phi::DenseTensor>("BatchMetrics");
-    auto* out1 = ctx.Output<phi::DenseTensor>("AccumMetrics");
-    auto* out2 = ctx.Output<phi::DenseTensor>("AccumStatesInfo");
-
-    const int* ids_data = in0->data<int>();
-    const int* labels_data = in1->data<int>();
-    size_t cls_num = static_cast<size_t>(ctx.Attr<int>("class_number"));
-    const T* weights_data = in2 ? in2->data<T>() : nullptr;
-    const T* states_data = in3 ?
in3->data() : nullptr; - double* batch_metrics_data = out0->mutable_data(ctx.GetPlace()); - double* accum_metrics_data = out1->mutable_data(ctx.GetPlace()); - out2->mutable_data(ctx.GetPlace()); - auto accum_states = EigenMatrix::From(*out2); - accum_states.setZero(); - T* accum_states_data = out2->data(); - - size_t sample_num = in0->dims()[0]; - size_t state_var_num = 4; // TP FP TN FN - - // get states info for current batch - for (size_t i = 0; i < sample_num; ++i) { - size_t idx = ids_data[i]; - size_t label = labels_data[i]; - - PADDLE_ENFORCE_GE( - idx, - 0, - phi::errors::InvalidArgument( - "Class index of each instance should be " - "greater than or equal to 0, But the index we received is %d", - idx)); - PADDLE_ENFORCE_LT(idx, - cls_num, - phi::errors::InvalidArgument( - "Class index of each instance should be less than " - "cls_num = %d, But the index we received is %d", - cls_num, - idx)); - - PADDLE_ENFORCE_GE(label, - 0, - phi::errors::InvalidArgument( - "Label of each instance should be greater than or " - "equal to 0, But the label we received is %d", - label)); - PADDLE_ENFORCE_LT(label, - cls_num, - phi::errors::InvalidArgument( - "Label of each instance should be less than " - "cls_num = %d, But the label we received is %d", - cls_num, - label)); - - T w = weights_data ? weights_data[i] : 1.0; - if (idx == label) { - accum_states_data[idx * state_var_num + TP] += w; - for (size_t j = 0; j < cls_num; ++j) { - accum_states_data[j * state_var_num + TN] += w; - } - accum_states_data[idx * state_var_num + TN] -= w; - } else { - accum_states_data[label * state_var_num + FN] += w; - accum_states_data[idx * state_var_num + FP] += w; - for (size_t j = 0; j < cls_num; ++j) { - accum_states_data[j * state_var_num + TN] += w; - } - accum_states_data[idx * state_var_num + TN] -= w; - accum_states_data[label * state_var_num + TN] -= w; - } - } - - ComputeMetrics( - accum_states_data, batch_metrics_data, state_var_num, cls_num); - - if (states_data) { - for (size_t i = 0; i < cls_num; ++i) { - for (size_t j = 0; j < state_var_num; ++j) { - size_t idx = i * state_var_num + j; - accum_states_data[idx] += states_data[idx]; - } - } - } - - ComputeMetrics( - accum_states_data, accum_metrics_data, state_var_num, cls_num); - } - - // expose to be reused - static inline T CalcPrecision(T tp_count, T fp_count) { - if (tp_count > 0.0 || fp_count > 0.0) { - return tp_count / (tp_count + fp_count); - } - return 1.0; - } - - static inline T CalcRecall(T tp_count, T fn_count) { - if (tp_count > 0.0 || fn_count > 0.0) { - return tp_count / (tp_count + fn_count); - } - return 1.0; - } - - static inline T CalcF1Score(T precision, T recall) { - if (precision > 0.0 || recall > 0.0) { - return 2 * precision * recall / (precision + recall); - } - return 0.0; - } - - protected: - void ComputeMetrics(const T* states_data, - double* metrics_data, - size_t state_var_num, - size_t cls_num) const { - T total_tp_count = 0; - T total_fp_count = 0; - T total_fn_count = 0; - T macro_avg_precision = 0.0; - T macro_avg_recall = 0.0; - - for (size_t i = 0; i < cls_num; ++i) { - T tp_count = states_data[i * state_var_num + TP]; - T fp_count = states_data[i * state_var_num + FP]; - T fn_count = states_data[i * state_var_num + FN]; - total_tp_count += tp_count; - total_fp_count += fp_count; - total_fn_count += fn_count; - macro_avg_precision += CalcPrecision(tp_count, fp_count); - macro_avg_recall += CalcRecall(tp_count, fn_count); - } - macro_avg_precision /= cls_num; - macro_avg_recall /= cls_num; - T macro_f1_score = 
CalcF1Score(macro_avg_precision, macro_avg_recall); - - T micro_avg_precision = CalcPrecision(total_tp_count, total_fp_count); - T micro_avg_recall = CalcRecall(total_tp_count, total_fn_count); - T micro_f1_score = CalcF1Score(micro_avg_precision, micro_avg_recall); - - // fill metrics data - metrics_data[0] = macro_avg_precision; - metrics_data[1] = macro_avg_recall; - metrics_data[2] = macro_f1_score; - metrics_data[3] = micro_avg_precision; - metrics_data[4] = micro_avg_recall; - metrics_data[5] = micro_f1_score; - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/metrics/unity_build_rule.cmake b/paddle/fluid/operators/metrics/unity_build_rule.cmake deleted file mode 100644 index dee8680cc93d3..0000000000000 --- a/paddle/fluid/operators/metrics/unity_build_rule.cmake +++ /dev/null @@ -1,7 +0,0 @@ -# This file records the Unity Build compilation rules. -# The source files in a `register_unity_group` called are compiled in a unity -# file. -# Generally, the combination rules in this file do not need to be modified. -# If there are some redefined error in compiling with the source file which -# in combination rule, you can remove the source file from the following rules. -register_unity_group(cc precision_recall_op.cc) diff --git a/paddle/fluid/operators/minus_op.cc b/paddle/fluid/operators/minus_op.cc deleted file mode 100644 index 1726a8f818ec1..0000000000000 --- a/paddle/fluid/operators/minus_op.cc +++ /dev/null @@ -1,162 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/minus_op.h" - -#include -#include -#include -#include - -namespace paddle { -namespace operators { - -class MinusOp : public framework::OperatorWithKernel { - public: - MinusOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : OperatorWithKernel(type, inputs, outputs, attrs) {} - - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasInput("X"), - true, - phi::errors::NotFound("Input(X) of MinusOp is not found.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("Y"), - true, - phi::errors::NotFound("Input(Y) of MinusOp is not found.")); - PADDLE_ENFORCE_EQ( - ctx->HasOutput("Out"), - true, - phi::errors::NotFound("Output(Out) of MinusOp is not found.")); - - auto x_dims = ctx->GetInputDim("X"); - auto y_dims = ctx->GetInputDim("Y"); - - if (ctx->IsRuntime() || - (common::product(x_dims) > 0 && common::product(y_dims) > 0)) { - PADDLE_ENFORCE_EQ( - x_dims, - y_dims, - phi::errors::InvalidArgument( - "Minus operator must take two tensor with same dim, but received " - "input X dim is:[%s], Y dim is:[%s]", - x_dims, - y_dims)); - } - ctx->SetOutputDim("Out", x_dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } -}; - -class MinusOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "The left tensor of minus operator."); - AddInput("Y", "The right tensor of minus operator."); - AddOutput("Out", "The output tensor of minus operator."); - - AddComment(R"DOC( -Minus Operator. - -Equation: - - $Out = X - Y$ - -Both the input `X` and `Y` can carry the LoD (Level of Details) information, -or not. But the output only shares the LoD information with input `X`. 
- -)DOC"); - } -}; - -class MinusGradDescMaker : public framework::GradOpDescMakerBase { - public: - using framework::GradOpDescMakerBase::GradOpDescMakerBase; - - std::vector> operator()() const override { - std::vector> ops; - auto x_g = this->InputGrad("X"); - if (!x_g.empty()) { - auto *x_g_op = new framework::OpDesc(); - x_g_op->SetType("scale"); - x_g_op->SetInput("X", this->OutputGrad("Out")); - x_g_op->SetOutput("Out", x_g); - x_g_op->SetAttr("scale", 1.0f); - ops.emplace_back(x_g_op); - } - - auto y_g = this->InputGrad("Y"); - if (!y_g.empty()) { - auto *y_g_op = new framework::OpDesc(); - y_g_op->SetType("scale"); - y_g_op->SetInput("X", this->OutputGrad("Out")); - y_g_op->SetOutput("Out", y_g); - y_g_op->SetAttr("scale", -1.0f); - ops.emplace_back(y_g_op); - } - - return ops; - } -}; - -class MinusGradMaker : public imperative::GradOpBaseMakerBase { - public: - using imperative::GradOpBaseMakerBase::GradOpBaseMakerBase; - - std::shared_ptr operator()() const override { - auto x_g = this->InputGrad("X"); - auto y_g = this->InputGrad("Y"); - - auto node = this->NewGradNode(); - - if (!x_g.empty()) { - imperative::TracedGradOp op(node); - op.SetType("scale"); - op.SetInput("X", this->OutputGrad("Out")); - op.SetOutput("Out", x_g); - op.SetAttr("scale", 1.0f); - op.SetDefaultAttrsMap(DefaultAttrsMap()); - } - - if (!y_g.empty()) { - imperative::TracedGradOp op(node); - op.SetType("scale"); - op.SetInput("X", this->OutputGrad("Out")); - op.SetOutput("Out", y_g); - op.SetAttr("scale", -1.0f); - op.SetDefaultAttrsMap(DefaultAttrsMap()); - } - - return node; - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR(minus, - ops::MinusOp, - ops::MinusOpMaker, - ops::MinusGradDescMaker, - ops::MinusGradMaker); -PD_REGISTER_STRUCT_KERNEL(minus, CPU, ALL_LAYOUT, ops::MinusKernel, float) {} - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PD_REGISTER_STRUCT_KERNEL(minus, GPU, ALL_LAYOUT, ops::MinusKernel, float) {} -#endif diff --git a/paddle/fluid/operators/minus_op.h b/paddle/fluid/operators/minus_op.h deleted file mode 100644 index 2f900a2b16bc2..0000000000000 --- a/paddle/fluid/operators/minus_op.h +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" - -namespace paddle { -namespace operators { - -template -class MinusKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* left_tensor = context.Input("X"); - auto* right_tensor = context.Input("Y"); - auto* out_tensor = context.Output("Out"); - - out_tensor->mutable_data(context.GetPlace()); - auto& dev = - *context.template device_context().eigen_device(); - phi::funcs::EigenSub, T>::Eval( - dev, - framework::EigenVector::Flatten(*out_tensor), - framework::EigenVector::Flatten(*left_tensor), - framework::EigenVector::Flatten(*right_tensor)); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/nccl/CMakeLists.txt b/paddle/fluid/operators/nccl/CMakeLists.txt index 629b41b4b582b..e46aa52527c06 100644 --- a/paddle/fluid/operators/nccl/CMakeLists.txt +++ b/paddle/fluid/operators/nccl/CMakeLists.txt @@ -15,10 +15,3 @@ if(WITH_ROCM AND NOT WIN32) SRCS nccl_gpu_common.cc DEPS device_context operator) endif() - -if(WITH_GPU OR WITH_ROCM) - op_library(nccl_op DEPS nccl_common) - set(OPERATOR_DEPS - ${OPERATOR_DEPS} nccl_common - PARENT_SCOPE) -endif() diff --git a/paddle/fluid/operators/nccl/nccl_op.cc b/paddle/fluid/operators/nccl/nccl_op.cc deleted file mode 100644 index dd3fd52d3b24d..0000000000000 --- a/paddle/fluid/operators/nccl/nccl_op.cc +++ /dev/null @@ -1,254 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" - -namespace paddle { -namespace operators { - -static constexpr char kParallelScopes[] = "parallel_scopes"; // NOLINT - -// NCCLinitOp -class NCCLInitOp : public framework::OperatorBase { - public: - NCCLInitOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &place) const override { - PADDLE_ENFORCE_NOT_NULL( - scope.FindVar(Input(kParallelScopes)), - phi::errors::NotFound("Can not find variable '%s' in the scope.", - kParallelScopes)); - const auto &name = Output("Communicator"); - PADDLE_ENFORCE_NOT_NULL( - scope.FindVar(name), - phi::errors::NotFound("Output(%s) is needed for ncclInit operator.", - name)); - // A parallel do may not use all the gpus. For example, the batch size is 7 - // in the last batch while we have 8 gpu. 
In this case, parallel_do will
-    // create 7 parallel scopes, so ncclInitOp should create 7 GPU peers.
-    auto &parallel_scopes = scope.FindVar(Input(kParallelScopes))
-                                ->Get<std::vector<framework::Scope *>>();
-    std::vector<int> gpus(parallel_scopes.size());
-    for (int i = 0; i < static_cast<int>(parallel_scopes.size()); ++i) {
-      gpus[i] = i;
-    }
-    PADDLE_ENFORCE_EQ(!gpus.empty(),
-                      true,
-                      phi::errors::PreconditionNotMet(
-                          "The gpu list is empty; ncclInit needs at least "
-                          "one gpu."));
-
-    platform::Communicator *comm =
-        scope.FindVar(name)->GetMutable<platform::Communicator>();
-    comm->InitAll(gpus);
-  }
-};
-
-class NCCLInitOpVarTypeInference : public framework::VarTypeInference {
- public:
-  void operator()(framework::InferVarTypeContext *ctx) const override {
-    ctx->SetOutputType("Communicator", framework::proto::VarType::RAW);
-  }
-};
-
-class NCCLInitOpShapeInference : public framework::InferShapeBase {
- public:
-  void operator()(framework::InferShapeContext *ctx) const override {}
-};
-
-class NCCLInitOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput(kParallelScopes, "The working place of parallel do.");
-    AddOutput("Communicator",
-              "Create Communicator for communicating between gpus");
-    AddComment(R"DOC(
-NCCLInit Operator.
-
-Create communicator.
-
-)DOC");
-  }
-};
-
-// AllReduceOp
-class NCCLAllReduceOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "NCCLAllReduce");
-    OP_INOUT_CHECK(ctx->HasInput("Communicator"),
-                   "Input",
-                   "Communicator",
-                   "NCCLAllReduce");
-
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "NCCLAllReduce");
-
-    std::string reduction = ctx->Attrs().Get<std::string>("reduction");
-    PADDLE_ENFORCE_EQ((reduction == "ncclSum" || reduction == "ncclProd" ||
-                       reduction == "ncclMin" || reduction == "ncclMax"),
-                      true,
-                      phi::errors::InvalidArgument("Invalid nccl reduction."));
-
-    auto x_dims = ctx->GetInputsDim("X");
-    ctx->SetOutputsDim("Out", x_dims);
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-};
-
-// AllReduceOpMaker
-class NCCLAllReduceOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "The input of AllReduce op");
-    AddInput("Communicator", "Communicator for communicating between gpus");
-    AddOutput("Out", "The output of AllReduce op");
-    AddAttr<std::string>("reduction",
-                         "(string, default 'ncclSum') "
-                         "{'ncclMin', 'ncclMax', 'ncclProd', 'ncclSum'}.")
-        .SetDefault("ncclSum");
-    AddComment(R"DOC(
-NCCLAllReduce Operator.
-
-AllReduce the input tensors across the GPUs in the communicator; every GPU
-receives the same reduced result.
- -)DOC"); - } -}; - -// ReduceOp -class NCCLReduceOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "NCCLReduce"); - OP_INOUT_CHECK( - ctx->HasInput("Communicator"), "Input", "Communicator", "NCCLReduce"); - - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "NCCLReduce"); - - std::string reduction = ctx->Attrs().Get("reduction"); - PADDLE_ENFORCE_EQ((reduction == "ncclSum" || reduction == "ncclProd" || - reduction == "ncclMin" || reduction == "ncclMax"), - true, - phi::errors::InvalidArgument("invalid nccl reduction.")); - - auto x_dims = ctx->GetInputsDim("X"); - ctx->SetOutputsDim("Out", x_dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } -}; - -// ReduceOp -class NCCLReduceOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "The input of Reduce op"); - AddInput("Communicator", "Communicator for communicating between gpus"); - AddOutput("Out", "The output of Reduce op"); - AddAttr("reduction", - "(string, default 'ncclSum') " - "{'ncclMin', 'ncclMax', 'ncclProd', 'ncclSum'}.") - .SetDefault("ncclSum"); - AddAttr("root", - "(int, default kInvalidGPUId) " - "Root gpu of the parameter. If not, " - "set(platform::kInvalidGPUId). Hashed by name.") - .SetDefault(platform::kInvalidGPUId); - AddComment(R"DOC( -NCCLReduce Operator. - -Reduce the tensors. - -)DOC"); - } -}; - -// BcastOp -class NCCLBcastOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "NCCLBcast"); - OP_INOUT_CHECK( - ctx->HasInput("Communicator"), "Input", "Communicator", "NCCLBcast"); - - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "NCCLBcast"); - - int root = ctx->Attrs().Get("root"); - PADDLE_ENFORCE_EQ(root != platform::kInvalidGPUId, - true, - phi::errors::InvalidArgument("Bcast root must be set.")); - - auto x_dims = ctx->GetInputsDim("X"); - ctx->SetOutputsDim("Out", x_dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } -}; - -// BcastOp -class NCCLBcastOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "The input of BcastSend op"); - AddInput("Communicator", "Communicator for communicating between gpus"); - AddOutput("Out", "The output of Bcast"); - AddAttr("root", - "(int, default kInvalidGPUId) " - "Root gpu of the parameter. If not, " - "set(platform::kInvalidGPUId). Hashed by name.") - .SetDefault(platform::kInvalidGPUId); - AddComment(R"DOC( -NCCLBcast Operator. - -Bcast the tensors. 
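-The tensor on the root GPU (attribute root) is copied into the Out tensor of
-every other GPU in the communicator, mirroring what NCCLBcastKernel in
-nccl_op.cu.cc does with ncclBcast.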
-
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(
-    ncclInit,
-    ops::NCCLInitOp,
-    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
-    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
-    ops::NCCLInitOpMaker,
-    ops::NCCLInitOpVarTypeInference,
-    ops::NCCLInitOpShapeInference);
-
-REGISTER_OP_WITHOUT_GRADIENT(ncclAllReduce,
-                             ops::NCCLAllReduceOp,
-                             ops::NCCLAllReduceOpMaker);
-REGISTER_OP_WITHOUT_GRADIENT(ncclBcast,
-                             ops::NCCLBcastOp,
-                             ops::NCCLBcastOpMaker);
-REGISTER_OP_WITHOUT_GRADIENT(ncclReduce,
-                             ops::NCCLReduceOp,
-                             ops::NCCLReduceOpMaker);
diff --git a/paddle/fluid/operators/nccl/nccl_op.cu.cc b/paddle/fluid/operators/nccl/nccl_op.cu.cc
deleted file mode 100644
index f1d6073a37231..0000000000000
--- a/paddle/fluid/operators/nccl/nccl_op.cu.cc
+++ /dev/null
@@ -1,180 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed
-to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <string>
-#include <unordered_map>
-
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
-
-namespace paddle {
-namespace operators {
-
-using platform::Communicator;
-
-template <typename T>
-class NCCLTypeWrapper;
-
-template <>
-class NCCLTypeWrapper<float> {
- public:
-  static const ncclDataType_t type = ncclFloat;
-};
-
-template <>
-class NCCLTypeWrapper<double> {
- public:
-  static const ncclDataType_t type = ncclDouble;
-};
-
-static ncclRedOp_t str_to_nccl_red_type(std::string reduction) {
-  static const std::unordered_map<std::string, ncclRedOp_t> str_to_type = {
-      {"ncclSum", ncclSum},
-      {"ncclMin", ncclMin},
-      {"ncclMax", ncclMax},
-      {"ncclProd", ncclProd},
-  };
-  auto it = str_to_type.find(reduction);
-  PADDLE_ENFORCE_EQ(it != str_to_type.end(),
-                    true,
-                    phi::errors::InvalidArgument(
-                        "Invalid nccl reduction. Must be ncclMin | ncclMax | "
-                        "ncclProd | ncclSum"));
-  return it->second;
-}
-
-template <typename T, typename DeviceContext>
-class NCCLAllReduceKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()),
-                      true,
-                      phi::errors::PreconditionNotMet(
-                          "This kernel only runs on GPU device."));
-    auto* x = ctx.Input<phi::DenseTensor>("X");
-    auto* out = ctx.Output<phi::DenseTensor>("Out");
-    auto* comm = ctx.Input<Communicator>("Communicator");
-    std::string reduction = ctx.Attr<std::string>("reduction");
-
-    auto reduction_op_ = str_to_nccl_red_type(reduction);
-
-    // device id
-    int gpu_id = ctx.GetPlace().GetDeviceId();
-    int idx = comm->GetCommId(gpu_id);
-    VLOG(3) << "gpu : " << gpu_id << " invoke allreduce. send " << x->numel()
-            << " recv " << out->numel();
-    PADDLE_ENFORCE_GPU_SUCCESS(
-        platform::dynload::ncclAllReduce(x->data<T>(),
-                                         out->mutable_data<T>(ctx.GetPlace()),
-                                         out->numel(),
-                                         NCCLTypeWrapper<T>::type,
-                                         reduction_op_,
-                                         comm->comms().at(idx),
-                                         ctx.cuda_device_context().stream()));
-    VLOG(3) << "gpu : " << gpu_id << " finished allreduce.
send " << x->numel() << " recv " - << out->numel(); - } -}; - -template -class NCCLReduceKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), - true, - phi::errors::InvalidArgument("This kernel only runs on GPU device.")); - auto x = ctx.Input("X"); // x0, x1, x2 - auto out = ctx.Output("Out"); - auto* comm = ctx.Input("Communicator"); - int root = ctx.Attr("root"); - std::string reduction = ctx.Attr("reduction"); - - auto reduction_op_ = str_to_nccl_red_type(reduction); - - // device id - int gpu_id = ctx.GetPlace().GetDeviceId(); - int idx = comm->GetCommId(gpu_id); - T* recvbuffer = nullptr; - if (root == gpu_id) { - recvbuffer = out->mutable_data(ctx.GetPlace()); - } else { - out->Resize(common::make_ddim({0})); - } - VLOG(3) << "gpu : " << gpu_id << " invoke reduce. send " << x->numel() - << " recv " << out->numel(); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::ncclReduce(x->data(), - recvbuffer, - x->numel(), - NCCLTypeWrapper::type, - reduction_op_, - root, - comm->comms().at(idx), - ctx.cuda_device_context().stream())); - VLOG(3) << "gpu : " << gpu_id << " finished reduce. send " << x->numel() - << " recv " << out->numel(); - } -}; - -template -class NCCLBcastKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), - true, - phi::errors::InvalidArgument("This kernel only runs on GPU device.")); - int root = ctx.Attr("root"); - auto* comm = ctx.Input("Communicator"); - // device id - int gpu_id = ctx.GetPlace().GetDeviceId(); - int idx = comm->GetCommId(gpu_id); - if (idx == root) { - auto* x = ctx.Input("X"); - VLOG(3) << "gpu : " << gpu_id << " invoke Bcast. send " << x->numel(); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast( - reinterpret_cast(const_cast(x->data())), - x->numel(), - NCCLTypeWrapper::type, - root, - comm->comms().at(idx), - ctx.cuda_device_context().stream())); - VLOG(3) << "gpu : " << gpu_id << " finished Bcast."; - } else { - auto* out = ctx.Output("Out"); - VLOG(3) << "gpu : " << gpu_id << " invoke Bcast. recv buffer " - << common::product(out->dims()); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::ncclBcast(out->mutable_data(ctx.GetPlace()), - out->numel(), - NCCLTypeWrapper::type, - root, - comm->comms().at(idx), - ctx.cuda_device_context().stream())); - VLOG(3) << "gpu : " << gpu_id << " finished Bcast. recv " << out->numel(); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -PD_REGISTER_STRUCT_KERNEL( - ncclAllReduce, GPU, ALL_LAYOUT, ops::NCCLAllReduceKernel, float) {} -PD_REGISTER_STRUCT_KERNEL( - ncclBcast, GPU, ALL_LAYOUT, ops::NCCLBcastKernel, float) {} -PD_REGISTER_STRUCT_KERNEL( - ncclReduce, GPU, ALL_LAYOUT, ops::NCCLReduceKernel, float) {} diff --git a/paddle/fluid/operators/rank_attention.cu.h b/paddle/fluid/operators/rank_attention.cu.h deleted file mode 100644 index 7077bd7a7aa4c..0000000000000 --- a/paddle/fluid/operators/rank_attention.cu.h +++ /dev/null @@ -1,218 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/common/dim.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -const int CUDA_NUM_THREADS = 1024; -static inline int GET_BLOCKS(const int N) { - return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; -} - -template -__global__ void expand_input_by_rank_kernel(const T* input, - int input_row, - int input_col, - T* output, - int output_row, - int output_col, - const int* rank_offset, - int rank_offset_row, - int rank_offset_col, - T* ins_rank, - int max_rank) { - CUDA_KERNEL_LOOP(idx, output_row * output_col) { - int output_col_idx = idx % output_col; - int output_row_idx = idx / output_col; - int k = output_col_idx / input_col; - - int faster = rank_offset[output_row_idx * rank_offset_col + 2 * k + 1] - 1; - if (output_col_idx == 0) { - ins_rank[output_row_idx] = rank_offset[output_row_idx * rank_offset_col]; - } - - if (rank_offset[output_row_idx * rank_offset_col] - 1 < 0 || faster < 0) { - continue; - } - - int rank_input_col_idx = output_col_idx % input_col; - int index = rank_offset[output_row_idx * rank_offset_col + 2 * k + 2]; - output[idx] = input[rank_input_col_idx + index * input_col]; - } -} - -template -void expand_rank_attention_input(gpuStream_t stream, - const T* input, - int input_row, - int input_col, - T* output, - int output_row, - int output_col, - const int* rank_offset, - int rank_offset_row, - int rank_offset_col, - T* ins_rank, - int max_rank) { - expand_input_by_rank_kernel<<>>(input, - input_row, - input_col, - output, - output_row, - output_col, - rank_offset, - rank_offset_row, - rank_offset_col, - ins_rank, - max_rank); -} - -template -__global__ void expand_rank_attention_param_kernel(const T* input, - int input_row, - int input_col, - const int* rank_offset, - int rank_offset_row, - int rank_offset_col, - const T* param, - int param_row, - int param_col, - T* output_param, - int output_param_row, - int output_param_col, - int max_rank) { - CUDA_KERNEL_LOOP(idx, output_param_row * output_param_col) { - int output_col_idx = idx % output_param_col; - int output_row_idx = idx / output_param_col; - - int block_matrix_row = max_rank * input_col; - int ins_idx = output_row_idx / block_matrix_row; - int start_offset = output_row_idx % block_matrix_row; - - int k = start_offset / input_col; - int k_offset = start_offset % input_col; - - int lower = rank_offset[ins_idx * rank_offset_col] - 1; - int faster = rank_offset[2 * k + 1 + rank_offset_col * ins_idx] - 1; - - if (lower < 0 || faster < 0) { - continue; - } - int start = lower * max_rank + faster; - int ori_idx = - start * param_col * input_col + k_offset * param_col + output_col_idx; - output_param[idx] = param[ori_idx]; - } -} - -template -void expand_rank_attention_param(gpuStream_t stream, - const T* input, - int input_row, - int input_col, - const int* rank_offset, - int rank_offset_row, - int rank_offset_col, - const T* param, - int param_row, - int param_col, - T* output_param, - int output_param_row, - int output_param_col, - int max_rank) { - expand_rank_attention_param_kernel<<>>(input, - input_row, - input_col, - 
rank_offset, - rank_offset_row, - rank_offset_col, - param, - param_row, - param_col, - output_param, - output_param_row, - output_param_col, - max_rank); -} - -template -__global__ void merge_param_gradient_kernel(T* expanded_grad, - int expanded_grad_row, - int expanded_grad_col, - T* param_grad, - int param_grad_row, - int param_grad_col, - const T* ins_rank, - int ins_num, - int max_rank, - int input_col) { - CUDA_KERNEL_LOOP(tid, param_grad_row * param_grad_col) { - int param_col_idx = tid % param_grad_col; - int param_row_idx = tid / param_grad_col; - - int block_matrix_row = max_rank * input_col; - int rank_idx = param_row_idx / block_matrix_row; - int rank_offset = param_row_idx % block_matrix_row; - - T tmp = 0; - for (int i = 0; i < ins_num; ++i) { - if (ins_rank[i] == rank_idx + 1) { - int row = i * block_matrix_row + rank_offset; - tmp += expanded_grad[row * expanded_grad_col + param_col_idx]; - } - } - param_grad[tid] = tmp; - } -} - -template -void merge_rank_attention_param_grad(gpuStream_t stream, - T* expanded_grad, - int expanded_grad_row, - int expanded_grad_col, - T* param_grad, - int param_grad_row, - int param_grad_col, - const T* ins_rank, - int ins_num, - int max_rank, - int input_col) { - merge_param_gradient_kernel<<>>(expanded_grad, - expanded_grad_row, - expanded_grad_col, - param_grad, - param_grad_row, - param_grad_col, - ins_rank, - ins_num, - max_rank, - input_col); -} - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/rank_attention_op.cc b/paddle/fluid/operators/rank_attention_op.cc deleted file mode 100644 index aaef2782f5e21..0000000000000 --- a/paddle/fluid/operators/rank_attention_op.cc +++ /dev/null @@ -1,211 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at -http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/
-
-#include "paddle/fluid/operators/rank_attention_op.h"
-
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/op_version_registry.h"
-
-namespace paddle {
-namespace operators {
-
-class RankAttentionOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("X"),
-                      true,
-                      phi::errors::InvalidArgument(
-                          "Input(X) of RankAttentionOp should not be null."));
-    PADDLE_ENFORCE_EQ(
-        ctx->HasInput("RankOffset"),
-        true,
-        phi::errors::InvalidArgument(
-            "Input(RankOffset) of RankAttentionOp should not be null."));
-    PADDLE_ENFORCE_EQ(
-        ctx->HasInput("RankParam"),
-        true,
-        phi::errors::InvalidArgument(
-            "Input(RankParam) of RankAttentionOp should not be null."));
-    PADDLE_ENFORCE_EQ(
-        ctx->HasOutput("InsRank"),
-        true,
-        phi::errors::InvalidArgument(
-            "Output(InsRank) of RankAttentionOp should not be null."));
-    PADDLE_ENFORCE_EQ(
-        ctx->HasOutput("InputHelp"),
-        true,
-        phi::errors::InvalidArgument(
-            "Output(InputHelp) of RankAttentionOp should not be null."));
-    PADDLE_ENFORCE_EQ(
-        ctx->HasOutput("Out"),
-        true,
-        phi::errors::InvalidArgument(
-            "Output(Out) of RankAttentionOp should not be null."));
-    auto max_rank = ctx->Attrs().Get<int>("MaxRank");
-
-    auto x_dims = ctx->GetInputDim("X");
-    auto ins_num = x_dims[0];
-    auto param_dims = ctx->GetInputDim("RankParam");
-    auto para_col = param_dims[1];
-    auto rank_offset_dims = ctx->GetInputDim("RankOffset");
-    auto x_fea_dim = x_dims[1];
-    auto block_matrix_row = max_rank * x_fea_dim;
-
-    PADDLE_ENFORCE_EQ(
-        (rank_offset_dims[1] - 1) / 2,
-        max_rank,
-        phi::errors::InvalidArgument("Input(RankOffset) has wrong columns; "
-                                     "expected the number of columns to be "
-                                     "%d, but got %d",
-                                     max_rank,
-                                     (rank_offset_dims[1] - 1) / 2));
-
-    ctx->SetOutputDim("Out", {ins_num, para_col});
-    ctx->SetOutputDim("InputHelp", {ins_num, block_matrix_row});
-    ctx->SetOutputDim("InsRank", {ins_num, 1});
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-
- protected:
-  phi::KernelKey GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"),
-                          ctx.GetPlace());
-  }
-};
-
-class RankAttentionGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(
-        ctx->HasInput("X"),
-        true,
-        phi::errors::InvalidArgument("Input(X) should not be null"));
-    PADDLE_ENFORCE_EQ(
-        ctx->HasInput("RankParam"),
-        true,
-        phi::errors::InvalidArgument("Input(RankParam) should not be null"));
-    PADDLE_ENFORCE_EQ(
-        ctx->HasInput("RankOffset"),
-        true,
-        phi::errors::InvalidArgument("Input(RankOffset) should not be null"));
-    PADDLE_ENFORCE_EQ(
-        ctx->HasInput("InputHelp"),
-        true,
-        phi::errors::InvalidArgument("Input(InputHelp) should not be null"));
-    PADDLE_ENFORCE_EQ(
-        ctx->HasInput("InsRank"),
-        true,
-        phi::errors::InvalidArgument("Input(InsRank) should not be null"));
-
-    ctx->SetOutputDim(framework::GradVarName("RankParam"),
-                      ctx->GetInputDim("RankParam"));
-  }
-
- protected:
-  phi::KernelKey GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(
-                              ctx, framework::GradVarName("Out")),
-                          ctx.GetPlace());
-  }
-};
-
-class RankAttentionOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
- void Make() override { - AddInput("X", "(Tensor) Input tensor of rank_attention_Op operator."); - AddInput("RankOffset", - "(Tensor) Input tensor of rank_attention_Op operator."); - AddInput("RankParam", - "(Tensor) Input tensor of rank_attention_Op operator."); - AddOutput("InputHelp", "Output tensor of rank_attention_Op operator.") - .AsDispensable(); - AddOutput("Out", "Output tensor of rank_attention_Op operator."); - AddOutput("InsRank", "Output tensor of rank_attention_Op operator.") - .AsDispensable(); - AddAttr("MaxRank", "(int, default 3) max rank of rank_attention_Op") - .SetDefault(3); - AddAttr("MaxSize", "(int, default 0) max rank of rank_attention_Op") - .SetDefault(0); - AddComment(R"DOC( -RankAttention Operator. -This Op can calculate rank attention between input and rank_param, -and rank_param gives the organization of data. Notice: It currently supports GPU device. -This Op exists in contrib, which means that it is not shown to the public. -)DOC"); - } -}; - -template -class RankAttentionGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("rank_attention_grad"); - - op->SetInput("X", this->Input("X")); - op->SetInput("RankOffset", this->Input("RankOffset")); - op->SetInput("RankParam", this->Input("RankParam")); - op->SetInput("InputHelp", this->Output("InputHelp")); - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - op->SetInput("InsRank", this->Output("InsRank")); - - op->SetOutput(framework::GradVarName("RankParam"), - this->InputGrad("RankParam")); - op->SetAttrMap(this->Attrs()); - } -}; -DECLARE_NO_NEED_BUFFER_VARS_INFERER( - RankAttentionGradOpNoNeedBufferVarsInference, - "X", - "RankOffset", - "RankParam"); - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR(rank_attention, - ops::RankAttentionOp, - ops::RankAttentionOpMaker, - ops::RankAttentionGradOpMaker, - ops::RankAttentionGradOpMaker); - -REGISTER_OPERATOR(rank_attention_grad, - ops::RankAttentionGradOp, - ops::RankAttentionGradOpNoNeedBufferVarsInference); - -PD_REGISTER_STRUCT_KERNEL( - rank_attention, CPU, ALL_LAYOUT, ops::RankAttentionKernel, float, double) {} - -REGISTER_OP_VERSION(rank_attention) - .AddCheckpoint( - R"ROC( - Upgrade rank_attention, add 1 outputs [InputHelp] and 1 attribute - [MaxSize]. - )ROC", - paddle::framework::compatible::OpVersionDesc() - .NewOutput("InputHelp", - "Output tensor of rank_attention_Op operator " - "in order to assist calculation in the reverse process.") - .NewAttr( - "MaxSize", - "Forward calculation to set the pre-applied video memory size", - 0)); diff --git a/paddle/fluid/operators/rank_attention_op.cu b/paddle/fluid/operators/rank_attention_op.cu deleted file mode 100644 index d73de790a527e..0000000000000 --- a/paddle/fluid/operators/rank_attention_op.cu +++ /dev/null @@ -1,258 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/operators/rank_attention.cu.h" -#include "paddle/fluid/operators/rank_attention_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/phi/backends/gpu/gpu_primitives.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" - -namespace paddle { -namespace operators { - -template -class RankAttentionCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *X = ctx.Input("X"); - auto *rank_offset = ctx.Input("RankOffset"); - auto *param = ctx.Input("RankParam"); - auto *input_help = ctx.Output("InputHelp"); - auto *ins_rank = ctx.Output("InsRank"); - int max_rank = ctx.Attr("MaxRank"); - int64_t max_size = ctx.Attr("MaxSize"); - auto *Out = ctx.Output("Out"); - - // check dims - auto x_dims = X->dims(); - auto ins_num = x_dims[0]; - auto x_fea_dim = x_dims[1]; - auto para_dims = param->dims(); - auto para_row = para_dims[0]; - auto para_col = para_dims[1]; - auto rank_offset_dims = rank_offset->dims(); - PADDLE_ENFORCE_EQ( - rank_offset_dims[0], - ins_num, - phi::errors::InvalidArgument("Input(RankOffset) has wrong rows.")); - PADDLE_ENFORCE_EQ( - (rank_offset_dims[1] - 1) / 2, - max_rank, - phi::errors::InvalidArgument("Input(RankOffset) has wrong columns.")); - PADDLE_ENFORCE_EQ( - max_rank * max_rank * x_fea_dim, - para_row, - phi::errors::InvalidArgument("Input(RankParam) has wrong rows.")); - - int block_matrix_row = max_rank * x_fea_dim; - - auto &dev_ctx = ctx.template device_context(); - - int max_ins = std::max(ins_num, max_size); - - phi::DenseTensor param_help; - param_help = ctx.AllocateTmpTensor( - {max_ins * block_matrix_row, para_col}, dev_ctx); - param_help.mutable_data(ctx.GetPlace()); - - input_help->Resize({max_ins, block_matrix_row}); - ins_rank->Resize({max_ins, 1}); - input_help->mutable_data(ctx.GetPlace()); - ins_rank->mutable_data(ctx.GetPlace()); - Out->mutable_data(ctx.GetPlace()); - - // initialize - auto param_help_eigen = framework::EigenVector::Flatten(param_help); - auto input_help_eigen = framework::EigenVector::Flatten(*input_help); - auto ins_rank_eigen = framework::EigenVector::Flatten(*ins_rank); - auto out_eigen = framework::EigenVector::Flatten(*Out); - - auto &place = - *ctx.template device_context().eigen_device(); - - param_help_eigen.device(place) = - param_help_eigen.constant(static_cast(0)); - input_help_eigen.device(place) = - input_help_eigen.constant(static_cast(0)); - ins_rank_eigen.device(place) = ins_rank_eigen.constant(static_cast(-1)); - out_eigen.device(place) = out_eigen.constant(static_cast(0)); - - // get data ptr - T *input_help_data = input_help->data(); - T *param_help_data = param_help.data(); - T *ins_rank_data = ins_rank->data(); - T *out_data = Out->data(); - - expand_rank_attention_input(ctx.cuda_device_context().stream(), - X->data(), - ins_num, - x_fea_dim, - input_help_data, - ins_num, - block_matrix_row, - rank_offset->data(), - rank_offset_dims[0], - rank_offset_dims[1], - ins_rank_data, - max_rank); - - expand_rank_attention_param(ctx.cuda_device_context().stream(), - X->data(), - ins_num, - x_fea_dim, - rank_offset->data(), - rank_offset_dims[0], - rank_offset_dims[1], - param->data(), - para_row, - para_col, - param_help_data, - ins_num * block_matrix_row, - para_col, - max_rank); - - CBLAS_TRANSPOSE transA = CblasNoTrans; - CBLAS_TRANSPOSE 
transB = CblasNoTrans; - - T alpha = 1; - T beta = 0; - int64_t strideA = block_matrix_row; - int64_t strideB = block_matrix_row * para_col; - - auto blas = phi::funcs::GetBlas(dev_ctx); - blas.BatchedGEMM(transA, - transB, - 1, - para_col, - block_matrix_row, - alpha, - input_help_data, - param_help_data, - beta, - out_data, - ins_num, - strideA, - strideB); - } -}; - -template -class RankAttentionGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *X = ctx.Input("X"); // not use data - auto *rank_offset = - ctx.Input("RankOffset"); // not use data - auto *param = ctx.Input("RankParam"); // not use data - auto *input_help = ctx.Input("InputHelp"); - auto *ins_rank = ctx.Input("InsRank"); - auto *dout = ctx.Input(framework::GradVarName("Out")); - int64_t max_size = ctx.Attr("MaxSize"); - - auto *drank_para = - ctx.Output(framework::GradVarName("RankParam")); - - // get dim - auto x_dims = X->dims(); - auto ins_num = x_dims[0]; - auto x_fea_dim = x_dims[1]; - auto para_dims = param->dims(); - auto para_row = para_dims[0]; - auto para_col = para_dims[1]; - auto rank_offset_dims = rank_offset->dims(); - auto max_rank = (rank_offset_dims[1] - 1) / 2; - int block_matrix_row = max_rank * x_fea_dim; - auto &dev_ctx = ctx.template device_context(); - auto &place = - *ctx.template device_context().eigen_device(); - - int max_ins = std::max(ins_num, max_size); - // initialize out grad - drank_para->mutable_data(ctx.GetPlace()); - auto drank_para_eigen = framework::EigenVector::Flatten(*drank_para); - drank_para_eigen.device(place) = - drank_para_eigen.constant(static_cast(0)); - - // copy data - phi::DenseTensor param_grad; - param_grad = ctx.AllocateTmpTensor( - {max_ins * block_matrix_row, para_col}, dev_ctx); - param_grad.mutable_data(ctx.GetPlace()); - // initialize - auto param_grad_eigen = framework::EigenVector::Flatten(param_grad); - param_grad_eigen.device(place) = - param_grad_eigen.constant(static_cast(0)); - // get data ptr - const T *input_help_data = input_help->data(); - const T *ins_rank_data = ins_rank->data(); - T *param_grad_data = param_grad.data(); - - auto blas = phi::funcs::GetBlas(dev_ctx); - T alpha = 1; - T beta = 0; - - // get param_grad - CBLAS_TRANSPOSE transA = CblasTrans; - CBLAS_TRANSPOSE transB = CblasNoTrans; - int64_t strideA = block_matrix_row; - int64_t strideB = para_col; - blas.BatchedGEMM(transA, - transB, - block_matrix_row, - para_col, - 1, - alpha, - input_help_data, - dout->data(), - beta, - param_grad_data, - ins_num, - strideA, - strideB); - // merge param_grad to get drank_para - merge_rank_attention_param_grad(ctx.cuda_device_context().stream(), - param_grad_data, - ins_num * block_matrix_row, - para_col, - drank_para->data(), - para_row, - para_col, - ins_rank_data, - ins_num, - max_rank, - x_fea_dim); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -PD_REGISTER_STRUCT_KERNEL(rank_attention, - GPU, - ALL_LAYOUT, - ops::RankAttentionCUDAKernel, - float, - double) {} - -PD_REGISTER_STRUCT_KERNEL(rank_attention_grad, - GPU, - ALL_LAYOUT, - ops::RankAttentionGradOpCUDAKernel, - float, - double) {} diff --git a/paddle/fluid/operators/rank_attention_op.h b/paddle/fluid/operators/rank_attention_op.h deleted file mode 100644 index f119c4a2f315c..0000000000000 --- a/paddle/fluid/operators/rank_attention_op.h +++ /dev/null @@ -1,33 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
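The forward pass above reduces rank attention to one strided-batch GEMM with M = 1: after expand_rank_attention_input and expand_rank_attention_param lay out a gathered feature row (input_help) and a per-instance parameter block (param_help), each instance's output row is a single vector-matrix product. A plain-loop reference of that last step, assuming row-major buffers shaped as in the kernel (the function name is illustrative):

#include <cstdint>

// Plain-loop reference for the BatchedGEMM call above (transA = transB =
// NoTrans, M = 1, N = para_col, K = block_matrix_row, batch = ins_num,
// strideA = K, strideB = K * N, alpha = 1, beta = 0):
//   out[i] = input_help[i] (1 x K)  *  param_help[i] (K x N)
void RankAttentionForwardReference(const float* input_help,  // [ins_num, K]
                                   const float* param_help,  // [ins_num*K, N]
                                   float* out,               // [ins_num, N]
                                   int64_t ins_num, int64_t K, int64_t N) {
  for (int64_t i = 0; i < ins_num; ++i) {
    const float* a = input_help + i * K;
    const float* b = param_help + i * K * N;
    float* c = out + i * N;
    for (int64_t n = 0; n < N; ++n) {
      float acc = 0.f;
      for (int64_t k = 0; k < K; ++k) acc += a[k] * b[k * N + n];
      c[n] = acc;
    }
  }
}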
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class RankAttentionKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), - true, - phi::errors::Unimplemented("Rank Attention only supports GPU now.")); - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/similarity_focus_op.cc b/paddle/fluid/operators/similarity_focus_op.cc deleted file mode 100644 index 4889dd9dfbf6b..0000000000000 --- a/paddle/fluid/operators/similarity_focus_op.cc +++ /dev/null @@ -1,99 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/similarity_focus_op.h" - -namespace paddle { -namespace operators { -class SimilarityFocusOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", - "(Tensor, default Tensor), a 4-D tensor with shape," - " [BatchSize, X, Y, Z]"); - AddOutput("Out", - "(Tensor, default Tensor), the similarity focus mask" - " with the same shape of input X."); - AddAttr("axis", - "(int32), indicating the dimension to be select. It can" - " only be 1, 2, or 3."); - AddAttr>("indexes", - "(std::vector), indicating the indexes" - " of the selected dimension."); - AddComment(R"DOC( -SimilarityFocus Operator. - -Generate a similarity focus mask with the same shape of input using the following method: -1. Extract the 3-D tensor(here the first dimension is BatchSize) corresponding - to the axis according to the indexes. For example, if axis=1 and indexes=[a], - it will get the matrix T=X[:, a, :, :]. In this case, if the shape of input X - is (BatchSize, A, B, C), the shape of tensor T is (BatchSize, B, C). -2. For each index, find the largest numbers in the tensor T, so that the same - row and same column has at most one number(what it means is that if the - largest number has been found in the i-th row and the j-th column, then - the numbers in the i-th row or j-th column will be skipped. And then the - next largest number will be selected from the remaining numbers. Obviously - there will be min(B, C) numbers), and mark the corresponding position of the - 3-D similarity focus mask as 1, otherwise as 0. Do elementwise-or for - each index. -3. 
Broadcast the 3-D similarity focus mask to the same shape of input X. - -Refer to `Similarity Focus Layer `_ -)DOC"); - } -}; - -class SimilarityFocusOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "SimilarityFocus"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "SimilarityFocus"); - - auto x_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_EQ( - x_dims.size(), - 4, - phi::errors::InvalidArgument( - "The dimension size of Input(X) be 4, but received %d.", - x_dims.size())); - ctx->SetOutputDim("Out", x_dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), - platform::CPUPlace()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR( - similarity_focus, - ops::SimilarityFocusOp, - ops::SimilarityFocusOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); -PD_REGISTER_STRUCT_KERNEL(similarity_focus, - CPU, - ALL_LAYOUT, - ops::SimilarityFocusKernel, - float, - double) {} diff --git a/paddle/fluid/operators/similarity_focus_op.h b/paddle/fluid/operators/similarity_focus_op.h deleted file mode 100644 index eea1d1953a4b9..0000000000000 --- a/paddle/fluid/operators/similarity_focus_op.h +++ /dev/null @@ -1,187 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
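The kernel in similarity_focus_op.h below implements the selection rule from the DOC above with a full descending sort plus row/column tag vectors. A compact sketch of the same greedy rule for a single rows x cols slice (a hypothetical standalone helper, not the operator's code):

#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

// Greedy selection from the DOC above for one row-major slice
// (slice.size() == rows * cols): repeatedly take the largest remaining value
// whose row and column are both unused, until min(rows, cols) positions are
// chosen.  Broadcasting along the selected axis and the elementwise-or across
// indexes are omitted.
std::vector<std::pair<int, int>> GreedyFocus(const std::vector<float>& slice,
                                             int rows, int cols) {
  std::vector<std::pair<float, int>> order(slice.size());
  for (int i = 0; i < rows * cols; ++i) order[i] = {slice[i], i};
  std::sort(order.begin(), order.end(),
            [](const auto& a, const auto& b) { return a.first > b.first; });
  std::vector<bool> row_used(rows, false), col_used(cols, false);
  std::vector<std::pair<int, int>> picks;
  for (const auto& cand : order) {
    int r = cand.second / cols, c = cand.second % cols;
    if (row_used[r] || col_used[c]) continue;
    row_used[r] = col_used[c] = true;
    picks.emplace_back(r, c);
    if (static_cast<int>(picks.size()) == std::min(rows, cols)) break;
  }
  return picks;
}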
*/ - -#pragma once - -#include -#include -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class SimilarityFocusKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - phi::DenseTensor* out = context.Output("Out"); - const phi::DenseTensor* x = context.Input("X"); - T* out_data = out->mutable_data(context.GetPlace()); - const T* x_data = x->data(); - - int axis = context.Attr("axis"); - std::vector indexes = context.Attr>("indexes"); - - int64_t batch_size = x->dims()[0]; - int64_t dim[4]; - for (int i = 1; i <= 3; ++i) { - dim[i] = x->dims()[i]; - } - - PADDLE_ENFORCE_GT( - indexes.size(), - 0, - phi::errors::InvalidArgument("The size of Attr(indexes) must be " - "greater than 0, but received %d.", - indexes.size())); - - for (size_t i = 0; i < indexes.size(); i++) { - PADDLE_ENFORCE_GT( - dim[axis], - indexes[i], - phi::errors::InvalidArgument( - "Each value of Attr(indexes) must be less than X.dim[axis], " - "but indexes[%d] received %d.", - i, - indexes[i])); - } - - int64_t array_size = 1; - for (int i = 1; i <= 3; ++i) { - if (i != axis) { - array_size *= dim[i]; - } - } - - std::vector> array(array_size); - - bool (*cmp)(std::pair, std::pair) = - [](std::pair x, std::pair y) { - return x.first > y.first; - }; - - int64_t (*compute_index)(int64_t*, int, int, int, int) = - [](int64_t* dim, int d1, int d2, int d3, int d4) { - return d1 * dim[1] * dim[2] * dim[3] + d2 * dim[2] * dim[3] + - d3 * dim[3] + d4; - }; - - PADDLE_ENFORCE_GT( - axis, - 0, - phi::errors::InvalidArgument( - "The value of Attr(axis) must be 1 or 2 or 3, but received %d.", - axis)); - PADDLE_ENFORCE_LT( - axis, - 4, - phi::errors::InvalidArgument( - "The value of Attr(axis) must be 1 or 2 or 3, but received %d.", - axis)); - memset(out_data, 0, sizeof(T) * batch_size * dim[1] * dim[2] * dim[3]); - for (int i = 0; i < batch_size; ++i) { - for (auto index : indexes) { - if (axis == 1) { - for (int j = 0; j < dim[2]; ++j) { - for (int k = 0; k < dim[3]; ++k) { - array[j * dim[3] + k] = std::make_pair( - x_data[compute_index(dim, i, index, j, k)], j * dim[3] + k); - } - } - - std::sort(array.begin(), array.end(), cmp); - int tag_num = 0; - std::vector tag2(dim[2]), tag3(dim[3]); - for (auto x : array) { - int idx2 = x.second / dim[3]; - int idx3 = x.second % dim[3]; - if (tag2[idx2] || tag3[idx3]) { - continue; - } - tag_num++; - tag2[idx2] = true; - tag3[idx3] = true; - for (int j = 0; j < dim[1]; ++j) { - out_data[compute_index(dim, i, j, idx2, idx3)] = 1; - } - if (tag_num == std::min(dim[2], dim[3])) { - break; - } - } - } else if (axis == 2) { - for (int j = 0; j < dim[1]; ++j) { - for (int k = 0; k < dim[3]; ++k) { - array[j * dim[3] + k] = std::make_pair( - x_data[compute_index(dim, i, j, index, k)], j * dim[3] + k); - } - } - - std::sort(array.begin(), array.end(), cmp); - int tag_num = 0; - std::vector tag1(dim[1]), tag3(dim[3]); - for (auto x : array) { - int idx1 = x.second / dim[3]; - int idx3 = x.second % dim[3]; - if (tag1[idx1] || tag3[idx3]) { - continue; - } - tag_num++; - tag1[idx1] = true; - tag3[idx3] = true; - for (int j = 0; j < dim[2]; ++j) { - out_data[compute_index(dim, i, idx1, j, idx3)] = 1; - } - if (tag_num == std::min(dim[1], dim[3])) { - break; - } - } - } else if (axis == 3) { - for (int j = 0; j < dim[1]; ++j) { - for (int k = 0; k < dim[2]; ++k) { - array[j * dim[2] + k] = std::make_pair( - 
x_data[compute_index(dim, i, j, k, index)], j * dim[2] + k); - } - } - - std::sort(array.begin(), array.end(), cmp); - int tag_num = 0; - std::vector tag1(dim[1]), tag2(dim[2]); - for (auto x : array) { - int idx1 = x.second / dim[2]; - int idx2 = x.second % dim[2]; - if (tag1[idx1] || tag2[idx2]) { - continue; - } - tag_num++; - tag1[idx1] = true; - tag2[idx2] = true; - for (int j = 0; j < dim[3]; ++j) { - out_data[compute_index(dim, i, idx1, idx2, j)] = 1; - } - if (tag_num == std::min(dim[1], dim[2])) { - break; - } - } - } - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/spp_op.cc b/paddle/fluid/operators/spp_op.cc deleted file mode 100644 index ad2ded506cd85..0000000000000 --- a/paddle/fluid/operators/spp_op.cc +++ /dev/null @@ -1,117 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -Indicesou may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/spp_op.h" - -#include -#include -namespace paddle { -namespace operators { - -class SppOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput( - "X", - "(Tensor) The input tensor of spp operator. " - "The format of input tensor is NCHW. Where N is batch size, C is the " - "number of channels, H and W is the height and width of feature."); - AddOutput("Out", - "(Tensor) The output tensor of spp operator." - "N * M." - "M = C * H * W"); - AddAttr("pyramid_height", "(int), multi level pooling"); - AddAttr( - "pooling_type", - "(string), pooling type, can be \"max\" for max-pooling " - "and \"avg\" for average-pooling.") - .InEnum({"max", "avg"}); - AddComment(R"DOC( - "With spatial pyramid pooling, the input image can - be of any sizes. This not only allows arbitrary aspect - ratios, but also allows arbitrary scales. We can resize - the input image to any scale (e.g., min(w, h)=180, 224, - ...) and apply the same deep network. When the - input image is at different scales, the network (with - the same filter sizes) will extract features at different - scales. The scales play important roles in traditional - methods. 
-        Input shape: $(N, C_{in}, H_{in}, W_{in})$
-        Output shape: $(H_{out}, W_{out})$
-        Where
-          $$
-          H_{out} = N \\
-          W_{out} = (((4^{pyramid\_height}) - 1) / (4 - 1)) * C_{in}
-          $$
-        Reference: https://arxiv.org/pdf/1406.4729v4.pdf
-        )DOC");
-  }
-};
-
-class SppOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(
-        ctx->HasInput("X"),
-        true,
-        phi::errors::InvalidArgument("Input(X) of SppOp should not be null."));
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"),
-                      true,
-                      phi::errors::InvalidArgument(
-                          "Output(Out) of SppOp should not be null."));
-    auto in_x_dims = ctx->GetInputDim("X");
-    int pyramid_height = ctx->Attrs().Get<int>("pyramid_height");
-    PADDLE_ENFORCE_EQ(in_x_dims.size(),
-                      4,
-                      phi::errors::InvalidArgument(
-                          "Input(X) of spp must be 4-dimensional."));
-    int outlen =
-        ((std::pow(4, pyramid_height) - 1) / (4 - 1)) * in_x_dims[1];  // NOLINT
-    std::vector<int64_t> output_shape({in_x_dims[0], outlen});
-    ctx->SetOutputDim("Out", common::make_ddim(output_shape));
-  }
-};
-
-class SppOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(
-        ctx->HasInput("X"),
-        true,
-        phi::errors::InvalidArgument("Input(X) must not be null."));
-    PADDLE_ENFORCE_EQ(
-        ctx->HasOutput(framework::GradVarName("X")),
-        true,
-        phi::errors::InvalidArgument("Output(X@GRAD) should not be null."));
-    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(
-    spp,
-    ops::SppOp,
-    ops::SppOpMaker,
-    paddle::framework::DefaultGradOpMaker<paddle::framework::OpDesc, true>,
-    paddle::framework::DefaultGradOpMaker<paddle::imperative::OpBase, true>);
-REGISTER_OPERATOR(spp_grad, ops::SppOpGrad);
-
-PD_REGISTER_STRUCT_KERNEL(spp, CPU, ALL_LAYOUT, ops::SppKernel, float, double) {
-}
-PD_REGISTER_STRUCT_KERNEL(
-    spp_grad, CPU, ALL_LAYOUT, ops::SppGradKernel, float, double) {}
diff --git a/paddle/fluid/operators/spp_op.cu.cc b/paddle/fluid/operators/spp_op.cu.cc
deleted file mode 100644
index b41fa8ae5fcf7..0000000000000
--- a/paddle/fluid/operators/spp_op.cu.cc
+++ /dev/null
@@ -1,22 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/spp_op.h"
-
-namespace ops = paddle::operators;
-
-PD_REGISTER_STRUCT_KERNEL(spp, GPU, ALL_LAYOUT, ops::SppKernel, float, double) {
-}
-PD_REGISTER_STRUCT_KERNEL(
-    spp_grad, GPU, ALL_LAYOUT, ops::SppGradKernel, float, double) {}
diff --git a/paddle/fluid/operators/spp_op.h b/paddle/fluid/operators/spp_op.h
deleted file mode 100644
index 5d3f4a78020a0..0000000000000
--- a/paddle/fluid/operators/spp_op.h
+++ /dev/null
@@ -1,220 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
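To make the output-width formula above concrete: level p pools the feature map into a 2^p x 2^p grid, contributing 4^p * C_in flattened values, so the level widths form the geometric series the formula sums. For pyramid_height = 3 and C_in = 64, outlen = (1 + 4 + 16) * 64 = ((4^3 - 1) / 3) * 64 = 1344. The per-level window geometry that SppKernel below derives from input_h, input_w, and p can be sketched as (struct and helper names are illustrative):

#include <cmath>

// Per-level pooling geometry used by the spp kernels in spp_op.h below:
// level p pools the H x W map into a bins x bins grid (bins = 2^p) with
// ceil-sized windows and symmetric padding, matching SppKernel's arithmetic.
struct SppLevel {
  int bins, kernel_h, kernel_w, padding_h, padding_w;
};

SppLevel MakeSppLevel(int p, int input_h, int input_w) {
  int bins = 1 << p;
  int kh = static_cast<int>(std::ceil(input_h / static_cast<double>(bins)));
  int kw = static_cast<int>(std::ceil(input_w / static_cast<double>(bins)));
  return {bins, kh, kw, (kh * bins - input_h + 1) / 2,
          (kw * bins - input_w + 1) / 2};
}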
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -Indicesou may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/phi_utils.h" -#include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/funcs/pooling.h" -#include "paddle/phi/kernels/funcs/strided_memcpy.h" - -namespace paddle { -namespace operators { -template -class SppKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const phi::DenseTensor* in_x = context.Input("X"); - auto* out = context.Output("Out"); - int pyramid_height = context.template Attr("pyramid_height"); - std::string pooling_type = - context.template Attr("pooling_type"); - out->mutable_data(context.GetPlace()); - auto out_stride = common::stride(out->dims()); - int input_h = in_x->dims()[2]; - int input_w = in_x->dims()[3]; - size_t output_offset = 0; - for (int p = 0; p < pyramid_height; ++p) { - int bins = std::pow(2, p); - int kernel_size_h = std::ceil(input_h / static_cast(bins)); - int kernel_size_w = std::ceil(input_w / static_cast(bins)); - int padding_h = (kernel_size_h * bins - input_h + 1) / 2; - int padding_w = (kernel_size_w * bins - input_w + 1) / 2; - std::vector kernel_size({kernel_size_h, kernel_size_w}); - std::vector strides({kernel_size_h, kernel_size_w}); - std::vector paddings({padding_h, padding_w}); - // pooling output shape - phi::DenseTensor out_level; - std::vector output_shape_vec( - {in_x->dims()[0], in_x->dims()[1], bins, bins}); - framework::DDim output_shape(common::make_ddim(output_shape_vec)); - out_level.mutable_data(output_shape, context.GetPlace()); - // pooling - if (pooling_type == "max") { - phi::funcs::Pool2dFunctor< - typename framework::ConvertToPhiContext::TYPE, - phi::funcs::MaxPool, - T> - pool_forward; - phi::funcs::MaxPool max_process; - pool_forward(context.template device_context(), - *in_x, - kernel_size, - strides, - paddings, - true, - false, - &out_level, - max_process); - } else if (pooling_type == "avg") { - phi::funcs::Pool2dFunctor< - typename framework::ConvertToPhiContext::TYPE, - phi::funcs::AvgPool, - T> - pool_forward; - phi::funcs::AvgPool avg_process; - pool_forward(context.template device_context(), - *in_x, - kernel_size, - strides, - paddings, - true, - false, - &out_level, - avg_process); - } - // flatten pooling output shape - int output_flatten_w = in_x->dims()[1] * bins * bins; - std::vector output_flatten_shape_vec( - {in_x->dims()[0], output_flatten_w}); - framework::DDim output_flatten_shape( - common::make_ddim(output_flatten_shape_vec)); - out_level.Resize(output_flatten_shape); - // concat - auto out_level_stride = common::stride(out_level.dims()); - phi::funcs::StridedMemcpy( - context.template device_context(), - out_level.data(), - out_level_stride, - out_level.dims(), - out_stride, - out->data() + output_offset); - output_offset += out_level.dims()[1] * out_level_stride[1]; - } - } -}; -template -class SppGradKernel : public framework::OpKernel { - public: 
- void Compute(const framework::ExecutionContext& context) const override { - const phi::DenseTensor* in_x = context.Input("X"); - const phi::DenseTensor* out = context.Input("Out"); - const phi::DenseTensor* out_grad = - context.Input(framework::GradVarName("Out")); - phi::DenseTensor* in_x_grad = - context.Output(framework::GradVarName("X")); - int pyramid_height = context.template Attr("pyramid_height"); - std::string pooling_type = - context.template Attr("pooling_type"); - auto& device_ctx = context.template device_context(); - phi::funcs::SetConstant< - typename framework::ConvertToPhiContext::TYPE, - T> - zero; - in_x_grad->mutable_data(context.GetPlace()); - zero(device_ctx, in_x_grad, static_cast(0)); - auto out_stride = common::stride(out->dims()); - int input_h = in_x->dims()[2]; - int input_w = in_x->dims()[3]; - size_t out_offset = 0; - for (int p = 0; p < pyramid_height; ++p) { - int bins = std::pow(2, p); - int kernel_size_h = std::ceil(input_h / static_cast(bins)); - int kernel_size_w = std::ceil(input_w / static_cast(bins)); - int padding_h = (kernel_size_h * bins - input_h + 1) / 2; - int padding_w = (kernel_size_w * bins - input_w + 1) / 2; - std::vector kernel_size({kernel_size_h, kernel_size_w}); - std::vector strides({kernel_size_h, kernel_size_w}); - std::vector paddings({padding_h, padding_w}); - // split out and outgrad ... to flatten - phi::DenseTensor out_level; - phi::DenseTensor outgrad_level; - int out_flatten_w = in_x->dims()[1] * bins * bins; - std::vector out_flatten_shape_vec( - {in_x->dims()[0], out_flatten_w}); - framework::DDim out_flatten_shape( - common::make_ddim(out_flatten_shape_vec)); - out_level.mutable_data(out_flatten_shape, context.GetPlace()); - outgrad_level.mutable_data(out_flatten_shape, context.GetPlace()); - auto flatten_stride = common::stride(out_level.dims()); - // memcpy - phi::funcs::StridedMemcpy( - context.template device_context(), - out->data() + out_offset, - out_stride, - out_level.dims(), - flatten_stride, - out_level.data()); - - phi::funcs::StridedMemcpy( - context.template device_context(), - out_grad->data() + out_offset, - out_stride, - outgrad_level.dims(), - flatten_stride, - outgrad_level.data()); - out_offset += out_level.dims()[1] * out_stride[1]; - // flatten backward to nchw - - std::vector out_shape_vec({in_x->dims()[0], in_x->dims()[1]}); - out_shape_vec.push_back( - (input_h - kernel_size_h + 2 * padding_h) / kernel_size_h + 1); - out_shape_vec.push_back( - (input_w - kernel_size_w + 2 * padding_w) / kernel_size_w + 1); - framework::DDim out_shape(common::make_ddim(out_shape_vec)); - out_level.ShareDataWith(out_level); - out_level.Resize(out_shape); - outgrad_level.ShareDataWith(outgrad_level); - outgrad_level.Resize(out_shape); - // pooling backward - if (pooling_type == "max") { - phi::funcs::MaxPool2dGradFunctor< - typename framework::ConvertToPhiContext::TYPE, - T> - pool2d_backward; - pool2d_backward(context.template device_context(), - *in_x, - *&out_level, - *&outgrad_level, - kernel_size, - strides, - paddings, - in_x_grad); - } else if (pooling_type == "avg") { - phi::funcs::Pool2dGradFunctor< - typename framework::ConvertToPhiContext::TYPE, - phi::funcs::AvgPoolGrad, - T> - pool_backward; - phi::funcs::AvgPoolGrad avg_process; - pool_backward(context.template device_context(), - *in_x, - *&out_level, - *&outgrad_level, - kernel_size, - strides, - paddings, - true, - false, - in_x_grad, - avg_process); - } - } - } -}; -} // namespace operators -} // namespace paddle diff --git 
a/paddle/fluid/operators/tdm_child_op.cc b/paddle/fluid/operators/tdm_child_op.cc
deleted file mode 100644
index e14dc0e316219..0000000000000
--- a/paddle/fluid/operators/tdm_child_op.cc
+++ /dev/null
@@ -1,130 +0,0 @@
-/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#include "paddle/fluid/operators/tdm_child_op.h"
-
-#include <vector>
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/sampler.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace operators {
-class TDMChildOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "X(Tensor), dtype support int32/int64, X variable is the "
-             "node id of TDM-Tree");
-    AddInput(
-        "TreeInfo",
-        "TreeInfo(Tensor), dtype support int32/int64, it stores the node "
-        "information in the following format: item_id(shape=1), "
-        "layer_id(shape=1), parent_id(shape=1), child_id(shape=child_nums)");
-    AddAttr<int>("child_nums",
-                 "child_nums(int), the number of children per node; if a "
-                 "node does not have enough children, pad the remaining "
-                 "slots with 0 until child_nums entries are filled");
-    AddOutput("Child",
-              "Return the children's node_id of the input node; "
-              "if the input node has no children, return 0");
-    AddOutput("LeafMask",
-              "LeafMask has the same shape as Child. "
-              "If the child is a leaf node, the LeafMask value is 1, else 0");
-    AddAttr<int>("dtype",
-                 "(int, default INT32) "
-                 "Output data type.")
-        .SetDefault(2);
-    AddComment(R"DOC(
-  **Tdm Child**
-  According to the input node_id on the given tree, return the corresponding
-  child node_id and, via LeafMask, whether each child is a leaf node.)DOC");
-  }
-};
-
-class TDMChildOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("X"),
-                      true,
-                      phi::errors::InvalidArgument(
-                          "Input(X) of TdmChild should not be null."));
-    PADDLE_ENFORCE_EQ(ctx->HasInput("TreeInfo"),
-                      true,
-                      phi::errors::InvalidArgument(
-                          "Input(TreeInfo) of TdmChild should not be null."));
-
-    int child_nums = ctx->Attrs().Get<int>("child_nums");
-    PADDLE_ENFORCE_GT(
-        child_nums,
-        0,
-        phi::errors::InvalidArgument(
-            "ValueError: The value of 'child_nums' must be greater than 0. "
-            "But received child_nums value = %d.",
-            child_nums));
-
-    auto info_dims = ctx->GetInputDim("TreeInfo");
-    auto input_dims = ctx->GetInputDim("X");
-
-    PADDLE_ENFORCE_EQ(
-        info_dims.size(),
-        2,
-        phi::errors::InvalidArgument(
-            "ShapeError: The dimensions of the 'tree info' must be 2.
" - "But received tree info's dimensions = %d, " - "tree info's shape = [%s].", - info_dims.size(), - info_dims)); - - auto output_dims = common::vectorize(input_dims); - output_dims.push_back(child_nums); - ctx->SetOutputDim("Child", common::make_ddim(output_dims)); - ctx->SetOutputDim("LeafMask", common::make_ddim(output_dims)); - - if (ctx->GetOutputsVarType("Child")[0] == - framework::proto::VarType::LOD_TENSOR) { - ctx->ShareLoD("X", /*->*/ "Child"); - ctx->ShareLoD("X", /*->*/ "LeafMask"); - } - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); - return phi::KernelKey(data_type, ctx.GetPlace()); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OPERATOR( - tdm_child, - ops::TDMChildOp, - ops::TDMChildOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); - -PD_REGISTER_STRUCT_KERNEL(tdm_child, - CPU, - ALL_LAYOUT, - ops::TDMChildKernel, - float, - double, - int, - int64_t) {} diff --git a/paddle/fluid/operators/tdm_child_op.h b/paddle/fluid/operators/tdm_child_op.h deleted file mode 100644 index 3380062743047..0000000000000 --- a/paddle/fluid/operators/tdm_child_op.h +++ /dev/null @@ -1,189 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#pragma once - -#include -#include -#include -#include -#include -#include - -#include "paddle/common/flags.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/mixed_vector.h" - -namespace paddle { -namespace operators { - -using DDim = framework::DDim; -using LoD = framework::LoD; - -template -void TDMChildInner(const framework::ExecutionContext &context, - const phi::DenseTensor &input, - const phi::DenseTensor &tree_info, - phi::DenseTensor *child, - phi::DenseTensor *mask) { - auto child_nums = context.Attr("child_nums"); - auto info_dims = tree_info.dims(); - int node_nums = info_dims[0]; - int length = info_dims[1]; - - int input_ids_num = input.numel(); - VLOG(4) << "TDM child op: input numel -> " << input_ids_num; - - std::vector child_vec{}; - std::vector item_mask_vec{}; - - auto *input_data = input.data(); - auto *tree_info_data = tree_info.data(); - - // TreeInfo: node_id : item_id; layer_id; ancestor_id; child_id - for (int input_ids = 0; input_ids < input_ids_num; ++input_ids) { - PADDLE_ENFORCE_LT( - input_data[input_ids], - node_nums, - phi::errors::InvalidArgument( - "input id of OP(paddle.incubate.layers.tdm_child) " - "expected >= 0 and < %ld, but got %ld. Please check input " - "value.", - node_nums, - input_data[input_ids])); - PADDLE_ENFORCE_LE( - 0, - input_data[input_ids], - phi::errors::InvalidArgument( - "input id of OP(paddle.incubate.layers.tdm_child) " - "expected >= 0 and < %ld, but got %ld. 
Please check input " - "value.", - node_nums, - input_data[input_ids])); - - bool has_child = - (input_data[input_ids] == 0 || - tree_info_data[static_cast(input_data[input_ids]) * length + 3] == - 0) - ? false - : true; - - if (has_child) { - for (int child_ids = 0; child_ids < child_nums; ++child_ids) { - OutT child_id = static_cast( - tree_info_data[static_cast(input_data[input_ids]) * length + - 3 + child_ids]); - child_vec.push_back(child_id); - OutT child_is_item = static_cast( - tree_info_data[static_cast(child_id) * length] == 0 ? 0 : 1); - item_mask_vec.push_back(child_is_item); - } - } else { - for (int child_ids = 0; child_ids < child_nums; ++child_ids) { - child_vec.push_back(0); - item_mask_vec.push_back(0); - } - } - } - - int output_nums = child_vec.size(); - auto *child_data = child->mutable_data(context.GetPlace()); - auto *leaf_mask_data = mask->mutable_data(context.GetPlace()); - - memcpy(child_data, &child_vec[0], sizeof(OutT) * output_nums); - memcpy(leaf_mask_data, &item_mask_vec[0], sizeof(OutT) * output_nums); -} - -template -class TDMChildKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *input_var = ctx.InputVar("X"); - auto *tree_info_var = ctx.InputVar("TreeInfo"); - - auto &input_tensor = input_var->Get(); - const auto &input_type = - framework::TransToProtoVarType(input_tensor.dtype()); - bool input_type_match = input_type == framework::proto::VarType::INT32 || - input_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(input_type_match, - true, - phi::errors::InvalidArgument( - "Input(X) holds the wrong type, it holds %s, but " - "desires to be %s or %s", - paddle::framework::DataTypeToString(input_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - - auto &tree_info_tensor = tree_info_var->Get(); - const auto &info_type = - framework::TransToProtoVarType(tree_info_tensor.dtype()); - bool info_type_match = info_type == framework::proto::VarType::INT32 || - info_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ( - info_type_match, - true, - phi::errors::InvalidArgument( - "Input(TreeInfo) holds the wrong type, it holds %s, but " - "desires to be %s or %s", - paddle::framework::DataTypeToString(info_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - - auto *child_var = ctx.OutputVar("Child"); - auto *leaf_mask_var = ctx.OutputVar("LeafMask"); - auto *child_tensor = child_var->GetMutable(); - auto *leaf_mask_tensor = leaf_mask_var->GetMutable(); - - auto output_type = - static_cast(ctx.Attr("dtype")); - bool out_type_match = output_type == framework::proto::VarType::INT32 || - output_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(out_type_match, - true, - phi::errors::InvalidArgument( - "Output(Child) & Output(LeafMask) holds the wrong " - "type, it holds %s, but " - "desires to be %s or %s", - paddle::framework::DataTypeToString(output_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - - if (info_type == framework::proto::VarType::INT32 && - output_type == framework::proto::VarType::INT32) { - TDMChildInner( - ctx, input_tensor, tree_info_tensor, child_tensor, leaf_mask_tensor); - } else if (info_type == 
framework::proto::VarType::INT64 && - output_type == framework::proto::VarType::INT32) { - TDMChildInner( - ctx, input_tensor, tree_info_tensor, child_tensor, leaf_mask_tensor); - } else if (info_type == framework::proto::VarType::INT32 && - output_type == framework::proto::VarType::INT64) { - TDMChildInner( - ctx, input_tensor, tree_info_tensor, child_tensor, leaf_mask_tensor); - } else if (info_type == framework::proto::VarType::INT64 && - output_type == framework::proto::VarType::INT64) { - TDMChildInner( - ctx, input_tensor, tree_info_tensor, child_tensor, leaf_mask_tensor); - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc deleted file mode 100644 index 29344b1ace0b0..0000000000000 --- a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc +++ /dev/null @@ -1,263 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/teacher_student_sigmoid_loss_op.h" - -#include - -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -class TeacherStudentSigmoidLossOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK( - ctx->HasInput("X"), "Input", "X", "teacher_student_sigmoid_loss"); - OP_INOUT_CHECK(ctx->HasInput("Label"), - "Input", - "Label", - "teacher_student_sigmoid_loss"); - OP_INOUT_CHECK( - ctx->HasOutput("Y"), "Output", "Y", "teacher_student_sigmoid_loss"); - - auto x_dims = ctx->GetInputDim("X"); - auto label_dims = ctx->GetInputDim("Label"); - PADDLE_ENFORCE_EQ(x_dims.size(), - 2UL, - phi::errors::InvalidArgument( - "Input(X)'s rank should be 2. But received: " - "Input(X)'s rank is [%d]", - x_dims.size())); - PADDLE_ENFORCE_EQ( - label_dims.size(), - 2UL, - phi::errors::InvalidArgument("Input(Label)'s rank should be 2. But " - "received Input(Label)'s rank is [%d]", - label_dims.size())); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ( - x_dims[0], - label_dims[0], - phi::errors::InvalidArgument( - "The 1st dimension of Input(X) and Input(Label) should " - "be equal. The difference is [%d]: [%d]", - x_dims[0], - label_dims[0])); - PADDLE_ENFORCE_EQ( - label_dims[1], - 1UL, - phi::errors::InvalidArgument("The 2nd dimension of " - "Input(Label) should be 1. But received " - "Input(Label)'s 2nd dim is [%d]", - label_dims[1])); - } - ctx->SetOutputDim("Y", {x_dims[0], 1}); - ctx->ShareLoD("X", /*->*/ "Y"); - } - - protected: - // Explicitly set that the data type of computation kernel of - // teacher_student_sigmoid_loss - // is determined by its input "X". 
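Looking back at tdm_child above: each TreeInfo row is laid out as [item_id, layer_id, parent_id, child_0 .. child_{child_nums-1}], which is why TDMChildInner reads children from offset 3 and treats a zero child slot as padding. A slightly simplified scalar sketch of the lookup for one node (hypothetical helper, int64 only; the real kernel dispatches over int32/int64 as shown above):

#include <cstdint>
#include <vector>

// Slightly simplified scalar sketch of TDMChildInner above.  A child id of 0
// means a padded slot, and a nonzero item_id in the child's own row marks it
// as a leaf.
void TdmChildLookup(const std::vector<int64_t>& tree_info, int64_t row_len,
                    int64_t node, int64_t child_nums,
                    std::vector<int64_t>* child, std::vector<int64_t>* mask) {
  bool has_child = node != 0 && tree_info[node * row_len + 3] != 0;
  for (int64_t k = 0; k < child_nums; ++k) {
    int64_t c = has_child ? tree_info[node * row_len + 3 + k] : 0;
    child->push_back(c);
    mask->push_back((c != 0 && tree_info[c * row_len] != 0) ? 1 : 0);
  }
}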
- phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.GetPlace()); - } -}; - -template -class TeacherStudentSigmoidLossGradOpMaker - : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("teacher_student_sigmoid_loss_grad"); - - op->SetInput("X", this->Input("X")); - op->SetInput("Label", this->Input("Label")); - op->SetInput(framework::GradVarName("Y"), this->OutputGrad("Y")); - - op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - - op->SetAttrMap(this->Attrs()); - } -}; - -class TeacherStudentSigmoidLossGradientOp - : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK( - ctx->HasInput("X"), "Input", "X", "teacher_student_sigmoid_loss_grad"); - OP_INOUT_CHECK(ctx->HasInput("Label"), - "Input", - "X", - "teacher_student_sigmoid_loss_grad"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Y")), - "Input", - "Y@Grad", - "teacher_student_sigmoid_loss_grad"); - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), - "Input", - "X@Grad", - "teacher_student_sigmoid_loss_grad"); - - auto x_dims = ctx->GetInputDim("X"); - auto label_dims = ctx->GetInputDim("Label"); - auto dy_dims = ctx->GetInputDim(framework::GradVarName("Y")); - PADDLE_ENFORCE_EQ( - x_dims.size(), - 2, - phi::errors::InvalidArgument( - "Input(X)'s rank should be 2. But received Input(X)'s rank is [%d]", - x_dims.size())); - PADDLE_ENFORCE_EQ(dy_dims.size(), - 2, - phi::errors::InvalidArgument( - "Input(Y@Grad)'s rank should be 2. But received " - "Input(Y@Grad)'s rank is [%d]", - dy_dims.size())); - PADDLE_ENFORCE_EQ(label_dims.size(), - 2, - phi::errors::InvalidArgument( - "Input(Label)'s rank should be 2. But received " - "Input(Y@Grad)'s rank is [%d]", - label_dims.size())); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ( - x_dims[0], - label_dims[0], - phi::errors::InvalidArgument( - "The 1st dimension of Input(X) and Input(Label) should " - "be equal. The difference is [%d]: [%d]", - x_dims[0], - label_dims[0])); - PADDLE_ENFORCE_EQ( - x_dims[0], - dy_dims[0], - phi::errors::InvalidArgument( - "The 1st dimension of Input(X) and Input(Y@Grad) should " - "be equal. The difference is [%d]: [%d]", - x_dims[0], - dy_dims[0])); - PADDLE_ENFORCE_EQ( - dy_dims[1], - 1, - phi::errors::InvalidArgument( - "The 2nd dimension of Input(Y@Grad) should be 1. " - "But received Input(Y@Grad)'s 2nd dimension is [%d]", - dy_dims[1])); - PADDLE_ENFORCE_EQ( - label_dims[1], - 1, - phi::errors::InvalidArgument( - "When Attr(soft_label) == false, the 2nd dimension of " - "Input(Label) should be 1. But received Input(Label)'s 2nd " - "dimension " - "is [%d]", - label_dims[1])); - } - ctx->SetOutputDim(framework::GradVarName("X"), x_dims); - ctx->ShareLoD("X", framework::GradVarName("X")); - } - - protected: - // Explicitly set that the data type of computation kernel of - // teacher_student_sigmoid_loss - // is determined by its input "X". 
- phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.GetPlace()); - } -}; - -class TeacherStudentSigmoidLossOpMaker - : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", - "(phi::DenseTensor, default phi::DenseTensor), a 2-D " - "tensor with shape [N x 1]," - " where N is the batch size and D is the output. " - "This input is a probability computed by the previous operator, " - "which is almost always the result of a softmax operator."); - AddInput("Label", - "(phi::DenseTensor), the ground truth which is a 2-D tensor. " - "Label is a phi::DenseTensor with shape [N x 1]. "); - AddOutput("Y", - "(phi::DenseTensor, default phi::DenseTensor), a 2-D " - "tensor with shape " - "[N x 1]. The teacher student sigmoid loss."); - AddAttr( - "soft_max_up_bound", - "fp32, if input > soft_max_up_bound, input will be bound, default 15.0") - .SetDefault(15.0); - AddAttr("soft_max_lower_bound", - "fp32, if input < soft_max_lower_bound, input will be " - "bound, default -15.0") - .SetDefault(-15.0); - AddComment(R"DOC( -TeacherStudentSigmoidLoss Operator. - -It's similarity to SigmoidCrossEntropyWithLogits Operator. The difference is that -we add another label(z') to original. - loss = max(x, 0) - x * z + log(1 + exp(-abs(x))) + max(x, 0) - x * z' + log(1 + exp(-abs(x))) - z is click or not - z' is teacher value - label = {-2, -1, [0, 2]} - when z' is not exist, clk = 0 : label = -2; - when z' is not exist, clk = 1 : label = -1; - when z' is exist , clk = 0 : label = 0 + z'; - when z' is exist , clk = 1 : label = 1 + z'; - -)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR( - teacher_student_sigmoid_loss, - ops::TeacherStudentSigmoidLossOp, - ops::TeacherStudentSigmoidLossOpMaker, - ops::TeacherStudentSigmoidLossGradOpMaker, - ops::TeacherStudentSigmoidLossGradOpMaker); - -REGISTER_OPERATOR(teacher_student_sigmoid_loss_grad, - ops::TeacherStudentSigmoidLossGradientOp); - -PD_REGISTER_STRUCT_KERNEL(teacher_student_sigmoid_loss, - CPU, - ALL_LAYOUT, - ops::TeacherStudentSigmoidLossOpKernel, - float, - double) {} -PD_REGISTER_STRUCT_KERNEL(teacher_student_sigmoid_loss_grad, - CPU, - ALL_LAYOUT, - ops::TeacherStudentSigmoidLossGradOpKernel, - float, - double) {} diff --git a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.h b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.h deleted file mode 100644 index 7ccb9438d4188..0000000000000 --- a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.h +++ /dev/null @@ -1,118 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class TeacherStudentSigmoidLossOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - phi::DenseTensor* y = context.Output("Y"); - const phi::DenseTensor* x = context.Input("X"); - const phi::DenseTensor* labels = context.Input("Label"); - T* y_data = y->mutable_data(context.GetPlace()); - const T* x_data = x->data(); - const T* label_data = labels->data(); - int64_t batch_size = x->dims()[0]; - // loss = max(x, 0) - x * z + log(1 + exp(-abs(x))) + max(x, 0) - x * z' + - // log(1 + exp(-abs(x))) - // z is click or not - // z' is value q of feed_fine - // label = {-2, -1, [0, 2]} - // when z' is not exist, clk = 0 : label = -2; - // when z' is not exist, clk = 1 : label = -1; - // when z' is exist , clk = 0 : label = 0 + z'; - // when z' is exist , clk = 1 : label = 1 + z'; - for (int i = 0; i < batch_size; ++i) { - if (label_data[i] < -1.0) { - y_data[i] = (x_data[i] > 0 ? x_data[i] : 0.0) + - log(1.0 + exp(-fabs(x_data[i]))); - } else if (label_data[i] < 0.0) { - y_data[i] = (x_data[i] > 0 ? x_data[i] : 0.0) - x_data[i] + - log(1.0 + exp(-fabs(x_data[i]))); - } else if (label_data[i] < 1.0) { - y_data[i] = (x_data[i] > 0 ? x_data[i] : 0.0) + - log(1.0 + exp(-fabs(x_data[i]))) + - (x_data[i] > 0 ? x_data[i] : 0.0) - - x_data[i] * label_data[i] + - log(1.0 + exp(-fabs(x_data[i]))); - } else { - y_data[i] = (x_data[i] > 0 ? x_data[i] : 0.0) - x_data[i] + - log(1.0 + exp(-fabs(x_data[i]))) + - (x_data[i] > 0 ? x_data[i] : 0.0) - - x_data[i] * (label_data[i] - 1.0) + - log(1.0 + exp(-fabs(x_data[i]))); - } - } - } -}; - -template -class TeacherStudentSigmoidLossGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const phi::DenseTensor* x = context.Input("X"); - const T* x_data = x->data(); - - phi::DenseTensor* dx = - context.Output(framework::GradVarName("X")); - T* dx_data = dx->mutable_data(context.GetPlace()); - - const phi::DenseTensor* labels = context.Input("Label"); - const T* label_data = labels->data(); - - T soft_max_up_bound = - static_cast(context.Attr("soft_max_up_bound")); - T soft_max_lower_bound = - static_cast(context.Attr("soft_max_lower_bound")); - - int64_t batch_size = x->dims()[0]; - - const phi::DenseTensor* dOut = - context.Input(framework::GradVarName("Y")); - - const T* dout_data = dOut->data(); - - for (int i = 0; i < batch_size; ++i) { - T sum_val = x_data[i]; - if (sum_val > soft_max_up_bound) { - sum_val = soft_max_up_bound; - } else { - if (sum_val < soft_max_lower_bound) { - sum_val = soft_max_lower_bound; - } - } - - T pred = 1.0 / (1.0 + exp(-sum_val)); - if (label_data[i] < -1.0) { - dx_data[i] = 0.0 - pred; - } else if (label_data[i] < 0.0) { - dx_data[i] = 1.0 - pred; - } else { - dx_data[i] = label_data[i] - 2.0 * pred; - } - if (sum_val >= soft_max_up_bound || sum_val <= soft_max_lower_bound) { - dx_data[i] = 0; - } - dx_data[i] *= dout_data[i] * -1; - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/unique_with_counts_op.cc b/paddle/fluid/operators/unique_with_counts_op.cc deleted file mode 100644 index 2e4af44ac8a1f..0000000000000 --- a/paddle/fluid/operators/unique_with_counts_op.cc +++ /dev/null @@ -1,85 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
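The piecewise loss in TeacherStudentSigmoidLossOpKernel above is a sum of one or two sigmoid-cross-entropy-with-logits terms, selected by the label encoding. A compact reference with a worked number (hypothetical helper; the branches follow directly from the kernel):

#include <cmath>

// Reference for the piecewise loss above.  The label encodes click and an
// optional teacher score z':
//   -2 -> no z', clk = 0;   -1 -> no z', clk = 1;
//   [0, 1) -> z', clk = 0;  [1, 2] -> 1 + z', clk = 1.
double TeacherStudentLoss(double x, double label) {
  // sigmoid cross-entropy with logits against target z
  auto ce = [](double v, double z) {
    return std::max(v, 0.0) - v * z + std::log(1.0 + std::exp(-std::fabs(v)));
  };
  if (label < -1.0) return ce(x, 0.0);                // click term only
  if (label < 0.0) return ce(x, 1.0);                 // click term only
  if (label < 1.0) return ce(x, 0.0) + ce(x, label);  // click + teacher
  return ce(x, 1.0) + ce(x, label - 1.0);             // click + teacher
}
// Worked number: x = 2.0, label = 1.7 (clk = 1, z' = 0.7):
// ce(2, 1) = log(1 + e^-2) ~= 0.127, ce(2, 0.7) = 2 - 1.4 + 0.127 ~= 0.727,
// so the loss is ~= 0.854.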
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/unique_with_counts_op.h" - -namespace paddle { -namespace operators { - -class UniqueWithCountsOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "unique_with_counts"); - OP_INOUT_CHECK( - ctx->HasOutput("Out"), "Output", "Out", "unique_with_counts"); - OP_INOUT_CHECK( - ctx->HasOutput("Index"), "Output", "Index", "unique_with_counts"); - OP_INOUT_CHECK( - ctx->HasOutput("Count"), "Output", "Count", "unique_with_counts"); - - auto in_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_EQ( - in_dims.size(), - 1, - phi::errors::InvalidArgument("The Input(X) should be 1-D Tensor, " - "But now the dims of Input(X) is %d.", - in_dims.size())); - - ctx->SetOutputDim("Out", {-1}); - ctx->SetOutputDim("Index", in_dims); - ctx->SetOutputDim("Count", {-1}); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), - platform::CPUPlace()); - } -}; - -class UniqueWithCountsOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "Input tensor. It should be a 1-D tensor."); - AddAttr("dtype", "data type for output index"); - AddOutput("Out", "A unique subsequence for input tensor."); - AddOutput("Index", - "An index tensor pointing to unique subsequence, which has " - "identical shape with input tensor and the data type is set by " - "the attr `dtype`"); - AddOutput("Count", "A subsequence for the count of unique index"); - AddComment(R"DOC( - Return a unique subsequence for 1-D input tensor, index tensor pointing to this unique subsequence, - and the subsequence for the count of unique index. -)DOC"); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(unique_with_counts, - ops::UniqueWithCountsOp, - ops::UniqueWithCountsOpMaker); -PD_REGISTER_STRUCT_KERNEL(unique_with_counts, - CPU, - ALL_LAYOUT, - ops::UniqueWithCountsKernel, - float, - double, - int32_t, - int64_t) {} diff --git a/paddle/fluid/operators/unique_with_counts_op.h b/paddle/fluid/operators/unique_with_counts_op.h deleted file mode 100644 index 4b1fef5e22447..0000000000000 --- a/paddle/fluid/operators/unique_with_counts_op.h +++ /dev/null @@ -1,44 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
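A concrete example of the three outputs described by the unique_with_counts maker above; first-appearance ordering is assumed here (the exact ordering is determined by UniqueOpFunctor), so treat it as illustrative:

    X     = [2, 3, 3, 1, 5, 3]
    Out   = [2, 3, 1, 5]          (unique values, in order of first appearance)
    Index = [0, 1, 1, 2, 3, 1]    (position of each X element in Out; same shape as X)
    Count = [1, 3, 1, 1]          (occurrences of each Out element)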
-See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/unique_op.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -class UniqueWithCountsKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto data_type = static_cast( - context.Attr("dtype")); - auto* x = context.Input("X"); - auto* out = context.Output("Out"); - auto* index = context.Output("Index"); - auto* count = context.Output("Count"); - framework::VisitDataType(data_type, - UniqueOpFunctor(out, index, x, count)); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/unzip_op.cc b/paddle/fluid/operators/unzip_op.cc deleted file mode 100644 index a72c0c6a878f3..0000000000000 --- a/paddle/fluid/operators/unzip_op.cc +++ /dev/null @@ -1,154 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/unzip_op.h" - -#include - -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -class unzipOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "lod"); - OP_INOUT_CHECK(ctx->HasOutput("Y"), "Output", "Y", "lod"); - auto lod_dims = ctx->GetInputDim("lod"); - PADDLE_ENFORCE_EQ( - lod_dims.size(), - 1UL, - phi::errors::InvalidArgument("Input(X)'s rank should be 1, but got %d", - lod_dims.size())); - auto len = static_cast(ctx->Attrs().Get("len")); - ctx->SetOutputDim("Y", {lod_dims[0] - 1, len}); - } - - protected: - // Explicitly set that the data type of computation kernel of - // unzip - // is determined by its input "X". 
- phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.device_context().GetPlace()); - } -}; - -class unzipGradientOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "unzipGradient"); - OP_INOUT_CHECK(ctx->HasInput("lod"), "Input", "unzip", "unzipGradient"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Y")), - "Input", - framework::GradVarName("Y"), - "unzipGradient"); - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), - "Output", - framework::GradVarName("X"), - "unzipGradient"); - - auto x_dims = ctx->GetInputDim("X"); - auto lod_dims = ctx->GetInputDim("lod"); - PADDLE_ENFORCE_EQ( - x_dims.size(), - 2, - phi::errors::InvalidArgument("Expect Input(X)'s rank == 2, but got %d", - x_dims.size())); - PADDLE_ENFORCE_EQ( - lod_dims.size(), - 1, - phi::errors::InvalidArgument("Expect Input(X)'s rank == 1, but got %d", - lod_dims.size())); - - ctx->SetOutputDim(framework::GradVarName("X"), x_dims); - ctx->ShareLoD("X", framework::GradVarName("X")); - } - - protected: - // Explicitly set that the data type of computation kernel of - // unzip - // is determined by its input "X". - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Y")), - ctx.device_context().GetPlace()); - } -}; - -class unzipOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(LodTensor, default LodTensor)"); - AddInput("lod", "(Tensor), a 1-D Tensor with shape [K]"); - AddAttr("len", "The len of each original Tensor").SetDefault(1); - AddOutput("Y", - "(LodTensor, default LodTensor), a 2-D tensor with shape " - "[K-1 x len]."); - AddComment(R"DOC( -unzip Operator. -)DOC"); - } -}; - -template -class unzipGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("unzip_grad"); - op->SetInput("X", this->Input("X")); - op->SetInput("lod", this->Input("lod")); - op->SetAttr("len", this->GetAttr("len")); - op->SetInput(framework::GradVarName("Y"), this->OutputGrad("Y")); - op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - op->SetAttrMap(this->Attrs()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR(unzip, - ops::unzipOp, - ops::unzipOpMaker, - ops::unzipGradOpMaker, - ops::unzipGradOpMaker); - -REGISTER_OPERATOR(unzip_grad, ops::unzipGradientOp); - -PD_REGISTER_STRUCT_KERNEL(unzip, - CPU, - ALL_LAYOUT, - ops::unzipOpKernel, - int64_t, - phi::dtype::complex, - phi::dtype::complex) {} -PD_REGISTER_STRUCT_KERNEL(unzip_grad, - CPU, - ALL_LAYOUT, - ops::unzipGradOpKernel, - int64_t, - phi::dtype::complex, - phi::dtype::complex) {} diff --git a/paddle/fluid/operators/unzip_op.cu b/paddle/fluid/operators/unzip_op.cu deleted file mode 100644 index 5be9bdea2b752..0000000000000 --- a/paddle/fluid/operators/unzip_op.cu +++ /dev/null @@ -1,102 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/fluid/operators/unzip_op.h" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/backends/gpu/gpu_primitives.h" -namespace paddle { -namespace operators { - -using phi::PADDLE_CUDA_NUM_THREADS; - -template -__global__ void unzipKernel( - const T* X, const LodType* lod, T* Y, size_t col_size, size_t n) { - CUDA_KERNEL_LOOP(i, n) { - int lod_idx = i / col_size; - int len = lod[lod_idx + 1] - lod[lod_idx]; - if (i >= lod_idx * col_size + len) { - Y[i] = 0; - } else { - Y[i] = X[lod[lod_idx] + i % col_size]; - } - } -} - -template -class unzipCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const auto* x = context.Input("X"); - const T* x_data = x->data(); - - const auto* lod = context.Input("lod"); - const LodType* lod_data = lod->data(); - - auto col_size = context.Attr("len"); - auto row_size = lod->dims()[0] - 1; - auto y_numel = col_size * row_size; - - auto* y = context.Output("Y"); - T* y_data = y->mutable_data(context.GetPlace()); - - // for Input X do not have lod Information. - auto stream = context.template device_context().stream(); - unzipKernel<<<(y_numel + PADDLE_CUDA_NUM_THREADS - 1) / - PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, - 0, - stream>>>(x_data, lod_data, y_data, col_size, y_numel); - } -}; - -template -class unzipGradCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - PADDLE_THROW(phi::errors::Unimplemented("unzip_grad is unimplemented")); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -PD_REGISTER_STRUCT_KERNEL(unzip, - GPU, - ALL_LAYOUT, - ops::unzipCUDAKernel, - float, - double, - phi::dtype::float16, - bool, - int, - int64_t, - phi::dtype::complex, - phi::dtype::complex) {} -PD_REGISTER_STRUCT_KERNEL(unzip_grad, - GPU, - ALL_LAYOUT, - ops::unzipGradCUDAKernel, - float, - double, - phi::dtype::float16, - bool, - int, - int64_t, - phi::dtype::complex, - phi::dtype::complex) {} diff --git a/paddle/fluid/operators/unzip_op.h b/paddle/fluid/operators/unzip_op.h deleted file mode 100644 index 6829d00dccf56..0000000000000 --- a/paddle/fluid/operators/unzip_op.h +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class unzipOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - PADDLE_THROW(phi::errors::Unimplemented("unzip is unimplemented")); - } -}; - -template -class unzipGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - PADDLE_THROW(phi::errors::Unimplemented("unzip_grad is unimplemented")); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/phi/backends/xpu/xpu2_op_list.cc b/paddle/phi/backends/xpu/xpu2_op_list.cc index 1a083c30fcef9..e2de6183f01e6 100644 --- a/paddle/phi/backends/xpu/xpu2_op_list.cc +++ b/paddle/phi/backends/xpu/xpu2_op_list.cc @@ -804,10 +804,6 @@ XPUOpMap& get_kl2_ops() { phi::DataType::INT32, phi::DataType::BOOL, phi::DataType::FLOAT32})}, - {"resnet_unit", - XPUKernelSet({phi::DataType::FLOAT16, phi::DataType::FLOAT32})}, - {"resnet_unit_grad", - XPUKernelSet({phi::DataType::FLOAT16, phi::DataType::FLOAT32})}, {"rmsprop", XPUKernelSet({phi::DataType::FLOAT32})}, {"rnn", XPUKernelSet({phi::DataType::FLOAT32})}, {"rnn_grad", XPUKernelSet({phi::DataType::FLOAT32})}, @@ -1194,8 +1190,6 @@ XPUOpMap& get_kl2_ops() { // Fused op {"squeeze_excitation_block", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, - {"resnet_basic_block_grad", XPUKernelSet({phi::DataType::FLOAT32})}, - {"resnet_basic_block", XPUKernelSet({phi::DataType::FLOAT32})}, {"fused_gemm_epilogue", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, {"fused_gemm_epilogue_grad", diff --git a/python/paddle/distributed/passes/auto_parallel_fp16.py b/python/paddle/distributed/passes/auto_parallel_fp16.py index c1d8c54c6b4b2..8232ae1317c6c 100644 --- a/python/paddle/distributed/passes/auto_parallel_fp16.py +++ b/python/paddle/distributed/passes/auto_parallel_fp16.py @@ -103,8 +103,6 @@ def _keep_fp32_input(op, in_name): return in_name != 'X' if op_type == 'fused_bn_add_activation': return in_name not in {'X', 'Z'} - if op_type == 'resnet_unit': - return in_name not in {'X', 'FilterX', 'Z', 'FilterZ'} if op_type in ['fused_attention', 'fused_feedforward']: return in_name in { 'LnScale', @@ -132,8 +130,6 @@ def _keep_fp32_output(op, out_name): return out_name != 'Y' if op_type == 'layer_norm' and _keep_layer_norm_scale_bias_to_fp32(): return out_name != 'Y' - if op_type == 'resnet_unit': - return out_name not in {'Y', 'ConvX', 'ConvZ'} if op_type in ['fused_attention', 'fused_feedforward']: return out_name in { 'LnMean', diff --git a/python/paddle/incubate/__init__.py b/python/paddle/incubate/__init__.py index e6e2dc766fc87..ff434a6fffc00 100644 --- a/python/paddle/incubate/__init__.py +++ b/python/paddle/incubate/__init__.py @@ -43,7 +43,6 @@ LookAhead, ModelAverage, ) -from .passes import fuse_resnet_unit_pass # noqa: F401 from .tensor import ( _npu_identity, # noqa: F401 segment_max, diff --git a/python/paddle/incubate/layers/__init__.py b/python/paddle/incubate/layers/__init__.py index f25a845d0a4dc..f3645718720db 100644 --- a/python/paddle/incubate/layers/__init__.py +++ b/python/paddle/incubate/layers/__init__.py @@ -27,10 +27,8 @@ partial_concat, partial_sum, pow2_decay_with_linear_warmup, - rank_attention, search_pyramid_hash, shuffle_batch, - tdm_child, tdm_sampler, ) diff --git a/python/paddle/incubate/layers/nn.py 
b/python/paddle/incubate/layers/nn.py index 5b6236567e649..78b6b6034ff24 100644 --- a/python/paddle/incubate/layers/nn.py +++ b/python/paddle/incubate/layers/nn.py @@ -646,91 +646,6 @@ def partial_sum(input, start_index=0, length=-1): return out -def tdm_child(x, node_nums, child_nums, param_attr=None, dtype='int32'): - """ - **Tdm Child** - According to the input node_id on the given tree, return the corresponding child node_id and - whether child is a leaf node by leaf_mask value. - - .. code-block:: text - - Given: - tree[[0], [1, 2], [3, 4], [5, 6]] # A binary tree with seven nodes - x = [[2], [3]] - node_nums = 7 - child_nums = 2 - - We get: - child = [[5, 6], - [0, 0]] - leaf_mask = [[1, 1], - [0, 0]] - - Args: - x (Tensor): Tensor contained the node_id information, dtype support int32/int64. - node_nums (int): Number of total nodes. - child_nums (int): Maximum number of child nodes per node. - param_attr (ParamAttr, optional): To specify the tdm-tree-info parameter property. Default: None, which means the - default weight parameter property is used. See usage for details in: ref: `api_paddle_ParamAttr`, should - has shape (node_nums, 3 + child_nums), dtype support int32/int64. - The dimension[1] of tdm-tree-info contains the following: - 1. Item_id (int, shape(1)), if node is a leaf node, give its item_id corresponding to node_id, else give 0. - 2. Layer_id (int, shape(1)), indicates which layer the node is on. - 3. Parent_id (int, shape(1)), node's parent node. - 4. Child_id (int, shape(child_nums)), all child node's node_id of this node should be given. - If the number of child nodes is insufficient, padding 0 until child nums equal to child_nums. - dtype (str, optional): The data type of output child and leaf_mask, support int32/int64. Default: int32. - - Returns: - tuple: A tuple including input node's child(Tensor) and leaf_mask(Tensor). - If child is a leaf node, leaf_mask equal ot 1, otherwise equal to 0. - - Examples: - .. code-block:: python - - >>> import paddle - >>> import numpy as np - >>> paddle.enable_static() - - >>> x = paddle.static.data(name="x", shape=[None, 1], dtype="int32", lod_level=1) - >>> tree_info = [[0,0,0,1,2], - ... [0,1,0,3,4],[0,1,0,5,6], - ... [0,2,1,0,0],[1,2,1,0,0],[2,2,2,0,0],[3,2,2,0,0]] - >>> tree_info_np = np.array(tree_info) - >>> tree_info_np = np.reshape(tree_info_np, (7,5)) - >>> node_nums = 7 - >>> child_nums = 2 - >>> child, leaf_mask = paddle.incubate.layers.tdm_child(x, node_nums, child_nums, - ... param_attr=paddle.ParamAttr( - ... 
initializer=paddle.nn.initializer.Assign(tree_info_np))) - - """ - helper = LayerHelper("tdm_child", **locals()) - check_dtype( - dtype, 'dtype', ['int32', 'int64'], 'paddle.incubate.layers.tdm_child' - ) - c_dtype = convert_np_dtype_to_dtype_(dtype) - tree_info = helper.create_parameter( - attr=helper.param_attr, - shape=[node_nums, 3 + child_nums], - dtype=dtype, - default_initializer=paddle.nn.initializer.Constant(0), - ) - tree_info.stop_gradient = True - - child = helper.create_variable_for_type_inference(dtype=dtype) - leaf_mask = helper.create_variable_for_type_inference(dtype=dtype) - - helper.append_op( - type='tdm_child', - inputs={'X': x, 'TreeInfo': tree_info}, - outputs={'Child': child, 'LeafMask': leaf_mask}, - attrs={'child_nums': child_nums, 'dtype': c_dtype}, - stop_gradient=True, - ) - return (child, leaf_mask) - - def tdm_sampler( x, neg_samples_num_list, @@ -949,75 +864,6 @@ def tdm_sampler( return (out, labels, mask) -def rank_attention( - input, - rank_offset, - rank_param_shape, - rank_param_attr, - max_rank=3, - max_size=0, -): - """ - **Rank Attention layer** - This Op can calculate rank attention between input and rank_param, and - rank_param gives the organization of data. Notice: It currently supports - GPU device. - This Op exists in incubate layers, which means that it is not shown to the public. - - Args: - input (Tensor): Tensor with data type float32, float64. - rank_offset (Tensor): Tensor with data type int32. - rank_para_shape (list[int]): The shape of rank_param. - rank_param_attr (ParamAttr): Attribute initializer of rank_param. - max_rank (int, optional): The max rank of input's ranks. Default is 3. - max_size (int, optional): The max size of input's ranks. Default is 0. - Returns: - Tensor: A Tensor with the same data type as input's. - - Examples: - .. code-block:: python - - >>> import paddle - >>> paddle.enable_static() - - >>> input = paddle.static.data(name="input", shape=[None, 2], dtype="float32") - >>> rank_offset = paddle.static.data(name="rank_offset", shape=[None, 7], dtype="int32") - >>> out = paddle.incubate.layers.rank_attention(input=input, - ... rank_offset=rank_offset, - ... rank_param_shape=[18,3], - ... rank_param_attr= - ... paddle.ParamAttr(learning_rate=1.0, - ... name="ubm_rank_param.w_0"), - ... max_rank=3, - ... 
max_size=0) - """ - helper = LayerHelper('rank_attention', **locals()) - dtype = helper.input_dtype(input_param_name='input') - input_shape = input.shape - assert input_shape[1] * max_rank * max_rank == rank_param_shape[0] - - rank_param = helper.create_parameter( - attr=rank_param_attr, shape=rank_param_shape, dtype=dtype - ) - rank_param.stop_gradient = False - - output = helper.create_variable_for_type_inference(dtype) - input_help = helper.create_variable_for_type_inference( - dtype=dtype, stop_gradient=True - ) - ins_rank = helper.create_variable_for_type_inference( - dtype=dtype, stop_gradient=True - ) - - helper.append_op( - type="rank_attention", - inputs={"X": input, "RankOffset": rank_offset, "RankParam": rank_param}, - outputs={"Out": output, "InputHelp": input_help, "InsRank": ins_rank}, - attrs={"MaxRank": max_rank, "MaxSize": max_size}, - ) - return output - - def batch_fc(input, param_size, param_attr, bias_size, bias_attr, act=None): """ **Batch FC layer** diff --git a/python/paddle/incubate/operators/__init__.py b/python/paddle/incubate/operators/__init__.py index 653dc97ed6193..df1c3a47d78a5 100644 --- a/python/paddle/incubate/operators/__init__.py +++ b/python/paddle/incubate/operators/__init__.py @@ -16,7 +16,6 @@ from .graph_reindex import graph_reindex # noqa: F401 from .graph_sample_neighbors import graph_sample_neighbors # noqa: F401 from .graph_send_recv import graph_send_recv # noqa: F401 -from .resnet_unit import ResNetUnit # noqa: F401 from .softmax_mask_fuse import softmax_mask_fuse # noqa: F401 from .softmax_mask_fuse_upper_triangle import ( # noqa: F401 softmax_mask_fuse_upper_triangle, diff --git a/python/paddle/incubate/operators/resnet_unit.py b/python/paddle/incubate/operators/resnet_unit.py deleted file mode 100644 index af2faa4cac44a..0000000000000 --- a/python/paddle/incubate/operators/resnet_unit.py +++ /dev/null @@ -1,361 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
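-
-# `resnet_unit` wraps a single fused cuDNN v8 kernel covering Conv2D +
-# BatchNorm on the main branch X, an optional Conv2D + BatchNorm shortcut
-# branch Z, an optional residual add, and the final activation. A rough
-# unfused sketch of the forward computation, inferred from the inputs and
-# attrs assembled below (illustrative pseudo-code, not a drop-in
-# equivalent):
-#
-#   out = act(bn_x(conv_x(x)) + bn_z(conv_z(z)))  # has_shortcut=True
-#   out = act(bn_x(conv_x(x)) + z)                # fuse_add=True, no shortcut
-#   out = act(bn_x(conv_x(x)))                    # otherwise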
- -import numpy as np - -import paddle -from paddle import base -from paddle.base.layer_helper import LayerHelper -from paddle.base.param_attr import ParamAttr -from paddle.nn import ( - Layer, - initializer as I, -) - - -def resnet_unit( - x, - filter_x, - scale_x, - bias_x, - mean_x, - var_x, - z, - filter_z, - scale_z, - bias_z, - mean_z, - var_z, - stride, - stride_z, - padding, - dilation, - groups, - momentum, - eps, - data_format, - fuse_add, - has_shortcut, - use_global_stats, - is_test, - act, -): - helper = LayerHelper('resnet_unit', **locals()) - bn_param_dtype = base.core.VarDesc.VarType.FP32 - bit_mask_dtype = base.core.VarDesc.VarType.INT32 - out = helper.create_variable_for_type_inference(x.dtype) - bit_mask = helper.create_variable_for_type_inference( - dtype=bit_mask_dtype, stop_gradient=True - ) - # intermediate_out for x - conv_x = helper.create_variable_for_type_inference( - dtype=x.dtype, stop_gradient=True - ) - saved_mean_x = helper.create_variable_for_type_inference( - dtype=bn_param_dtype, stop_gradient=True - ) - saved_invstd_x = helper.create_variable_for_type_inference( - dtype=bn_param_dtype, stop_gradient=True - ) - running_mean_x = mean_x - running_var_x = var_x - # intermediate_out for z - conv_z = helper.create_variable_for_type_inference( - dtype=x.dtype, stop_gradient=True - ) - saved_mean_z = helper.create_variable_for_type_inference( - dtype=bn_param_dtype, stop_gradient=True - ) - saved_invstd_z = helper.create_variable_for_type_inference( - dtype=bn_param_dtype, stop_gradient=True - ) - running_mean_z = ( - helper.create_variable_for_type_inference( - dtype=bn_param_dtype, stop_gradient=True - ) - if mean_z is None - else mean_z - ) - running_var_z = ( - helper.create_variable_for_type_inference( - dtype=bn_param_dtype, stop_gradient=True - ) - if var_z is None - else var_z - ) - - inputs = { - 'X': x, - 'FilterX': filter_x, - 'ScaleX': scale_x, - 'BiasX': bias_x, - 'MeanX': mean_x, - 'VarX': var_x, - 'Z': z, - 'FilterZ': filter_z, - 'ScaleZ': scale_z, - 'BiasZ': bias_z, - 'MeanZ': mean_z, - 'VarZ': var_z, - } - - attrs = { - 'stride': stride, - 'stride_z': stride_z, - 'padding': padding, - 'dilation': dilation, - 'group': groups, - 'momentum': momentum, - 'epsilon': eps, - 'data_format': data_format, - 'fuse_add': fuse_add, - 'has_shortcut': has_shortcut, - 'use_global_stats': use_global_stats, - 'is_test': is_test, - 'act_type': act, - } - - outputs = { - 'Y': out, - 'BitMask': bit_mask, - 'ConvX': conv_x, - 'SavedMeanX': saved_mean_x, - 'SavedInvstdX': saved_invstd_x, - 'RunningMeanX': running_mean_x, - 'RunningVarX': running_var_x, - 'ConvZ': conv_z, - 'SavedMeanZ': saved_mean_z, - 'SavedInvstdZ': saved_invstd_z, - 'RunningMeanZ': running_mean_z, - 'RunningVarZ': running_var_z, - } - - helper.append_op( - type='resnet_unit', inputs=inputs, outputs=outputs, attrs=attrs - ) - - return out - - -class ResNetUnit(Layer): - r""" - ******Temporary version******. - ResNetUnit is designed for optimize the performance by using cudnnv8 API. 
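-
-    It executes Conv2D + BatchNorm on the main branch (and, when
-    `has_shortcut=True`, a second Conv2D + BatchNorm on the shortcut branch,
-    plus the residual add and activation) as one fused kernel.
-
-    Examples:
-        .. code-block:: python
-
-            >>> # An illustrative sketch only; it assumes a CUDA build with
-            >>> # cuDNN >= 8.0 and a static-graph program.
-            >>> import paddle
-            >>> from paddle.incubate.operators.resnet_unit import ResNetUnit
-            >>> paddle.enable_static()
-            >>> x = paddle.static.data("x", [1, 56, 56, 8], dtype="float16")
-            >>> unit = ResNetUnit(num_channels_x=8, num_filters=32, filter_size=1)
-            >>> out = unit(x)  # fused conv + batch_norm + relu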
- """ - - def __init__( - self, - num_channels_x, - num_filters, - filter_size, - stride=1, - momentum=0.9, - eps=1e-5, - data_format='NHWC', - act='relu', - fuse_add=False, - has_shortcut=False, - use_global_stats=False, - is_test=False, - filter_x_attr=None, - scale_x_attr=None, - bias_x_attr=None, - moving_mean_x_name=None, - moving_var_x_name=None, - num_channels_z=1, - stride_z=1, - filter_z_attr=None, - scale_z_attr=None, - bias_z_attr=None, - moving_mean_z_name=None, - moving_var_z_name=None, - ): - super().__init__() - self._stride = stride - self._stride_z = stride_z - self._dilation = 1 - self._kernel_size = paddle.utils.convert_to_list( - filter_size, 2, 'kernel_size' - ) - self._padding = (filter_size - 1) // 2 - self._groups = 1 - self._momentum = momentum - self._eps = eps - self._data_format = data_format - self._act = act - self._fuse_add = fuse_add - self._has_shortcut = has_shortcut - self._use_global_stats = use_global_stats - self._is_test = is_test - - # check format - valid_format = {'NHWC', 'NCHW'} - if data_format not in valid_format: - raise ValueError( - f"conv_format must be one of {valid_format}, but got conv_format='{data_format}'" - ) - - def _get_default_param_initializer(channels): - filter_elem_num = np.prod(self._kernel_size) * channels - std = (2.0 / filter_elem_num) ** 0.5 - return I.Normal(0.0, std) - - is_nchw = data_format == 'NCHW' - # initial filter - bn_param_dtype = base.core.VarDesc.VarType.FP32 - if not is_nchw: - bn_param_shape = [1, 1, 1, num_filters] - filter_x_shape = [ - num_filters, - filter_size, - filter_size, - num_channels_x, - ] - filter_z_shape = [ - num_filters, - filter_size, - filter_size, - num_channels_z, - ] - else: - bn_param_shape = [1, num_filters, 1, 1] - filter_x_shape = [ - num_filters, - num_channels_x, - filter_size, - filter_size, - ] - filter_z_shape = [ - num_filters, - num_channels_z, - filter_size, - filter_size, - ] - - self.filter_x = self.create_parameter( - shape=filter_x_shape, - attr=filter_x_attr, - default_initializer=_get_default_param_initializer(num_channels_x), - ) - self.scale_x = self.create_parameter( - shape=bn_param_shape, - attr=scale_x_attr, - dtype=bn_param_dtype, - default_initializer=I.Constant(1.0), - ) - self.bias_x = self.create_parameter( - shape=bn_param_shape, - attr=bias_x_attr, - dtype=bn_param_dtype, - is_bias=True, - ) - self.mean_x = self.create_parameter( - attr=ParamAttr( - name=moving_mean_x_name, - initializer=I.Constant(0.0), - trainable=False, - ), - shape=bn_param_shape, - dtype=bn_param_dtype, - ) - self.mean_x.stop_gradient = True - self.var_x = self.create_parameter( - attr=ParamAttr( - name=moving_var_x_name, - initializer=I.Constant(1.0), - trainable=False, - ), - shape=bn_param_shape, - dtype=bn_param_dtype, - ) - self.var_x.stop_gradient = True - if has_shortcut: - self.filter_z = self.create_parameter( - shape=filter_z_shape, - attr=filter_z_attr, - default_initializer=_get_default_param_initializer( - num_channels_z - ), - ) - self.scale_z = self.create_parameter( - shape=bn_param_shape, - attr=scale_z_attr, - dtype=bn_param_dtype, - default_initializer=I.Constant(1.0), - ) - self.bias_z = self.create_parameter( - shape=bn_param_shape, - attr=bias_z_attr, - dtype=bn_param_dtype, - is_bias=True, - ) - self.mean_z = self.create_parameter( - attr=ParamAttr( - name=moving_mean_z_name, - initializer=I.Constant(0.0), - trainable=False, - ), - shape=bn_param_shape, - dtype=bn_param_dtype, - ) - self.mean_z.stop_gradient = True - self.var_z = self.create_parameter( - 
attr=ParamAttr( - name=moving_var_z_name, - initializer=I.Constant(1.0), - trainable=False, - ), - shape=bn_param_shape, - dtype=bn_param_dtype, - ) - self.var_z.stop_gradient = True - else: - self.filter_z = None - self.scale_z = None - self.bias_z = None - self.mean_z = None - self.var_z = None - - def forward(self, x, z=None): - if self._fuse_add and z is None: - raise ValueError("z can not be None") - - out = resnet_unit( - x, - self.filter_x, - self.scale_x, - self.bias_x, - self.mean_x, - self.var_x, - z, - self.filter_z, - self.scale_z, - self.bias_z, - self.mean_z, - self.var_z, - self._stride, - self._stride_z, - self._padding, - self._dilation, - self._groups, - self._momentum, - self._eps, - self._data_format, - self._fuse_add, - self._has_shortcut, - self._use_global_stats, - self._is_test, - self._act, - ) - return out diff --git a/python/paddle/incubate/passes/fuse_resnet_unit_pass.py b/python/paddle/incubate/passes/fuse_resnet_unit_pass.py deleted file mode 100644 index 042e4dc7e85aa..0000000000000 --- a/python/paddle/incubate/passes/fuse_resnet_unit_pass.py +++ /dev/null @@ -1,135 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle.incubate.passes import ir - - -def set_resnet_unit_attrs(resnet_unit, has_shortcut): - resnet_unit.SetAttr("fuse_add", False) - resnet_unit.SetAttr("act_type", "relu") - resnet_unit.SetAttr("has_shortcut", has_shortcut) - resnet_unit.SetAttr("data_format", 'NHWC') - resnet_unit.SetAttr("dilation", 1) - resnet_unit.Attr("stride").MappedPattern( - op="conv2d", name="strides", element_index=0 - ) - resnet_unit.Attr("padding").MappedPattern( - op="conv2d", name="paddings", element_index=0 - ) - resnet_unit.Attr("group").MappedPattern(op="conv2d", name="groups") - resnet_unit.Attr("op_device").MappedPattern(op="conv2d", name="op_device") - resnet_unit.Attr("op_namescope").MappedPattern( - op="conv2d", name="op_namescope" - ) - resnet_unit.Attr("momentum").MappedPattern(op="batch_norm", name="momentum") - resnet_unit.Attr("epsilon").MappedPattern(op="batch_norm", name="epsilon") - resnet_unit.Attr("use_global_stats").MappedPattern( - op="batch_norm", name="use_global_stats" - ) - - -def set_resnet_unit_outputs(resnet_unit, meanX, varX, meanZ=None, varZ=None): - resnet_unit.SetOutputs( - RunningMeanX=meanX, - RunningVarX=varX, - RunningMeanZ=meanZ, - RunningVarZ=varZ, - ) - - -@ir.RegisterPass -def fuse_resnet_unit(): - def pattern_conv_bn(x, filter, scale, bias, mean, var): - filter.Attr("shape")[0].Mod(32).EQ(0) - filter.Attr("shape")[1].Mod(8).EQ(0) - filter.Attr("shape")[2].EQ(1) - filter.Attr("shape")[3].EQ(1) - conv2d = ir.PassDesc.OP.conv2d(Input=x, Filter=filter) - conv2d.SetAttr("data_format", 'NHWC') - bn = ir.PassDesc.OP.batch_norm( - X=conv2d, Bias=bias, Mean=mean, Scale=scale, Variance=var - ) - return bn - - def pattern_one_input(x, filter, scale, bias, mean, var): - bn = pattern_conv_bn(x, filter, scale, bias, mean, var) - relu = 
ir.PassDesc.OP.relu(X=bn.Output("Y")) - return relu - - def replace_one_input(x, filter, scale, bias, mean, var): - resnet_unit = ir.PassDesc.OP.resnet_unit( - X=x, FilterX=filter, ScaleX=scale, BiasX=bias, MeanX=mean, VarX=var - ) - set_resnet_unit_attrs(resnet_unit, False) - set_resnet_unit_outputs(resnet_unit, mean, var) - return resnet_unit.Output("Y") - - def pattern_two_input( - x, - filterX, - scaleX, - biasX, - meanX, - varX, - z, - filterZ, - scaleZ, - biasZ, - meanZ, - varZ, - ): - bnX = pattern_conv_bn(x, filterX, scaleX, biasX, meanX, varX) - bnZ = pattern_conv_bn(z, filterZ, scaleZ, biasZ, meanZ, varZ) - ewadd = ir.PassDesc.OP.elementwise_add( - X=bnX.Output("Y"), Y=bnZ.Output("Y") - ) - relu = ir.PassDesc.OP.relu(X=ewadd) - return relu - - def replace_two_input( - x, - filterX, - scaleX, - biasX, - meanX, - varX, - z, - filterZ, - scaleZ, - biasZ, - meanZ, - varZ, - ): - resnet_unit = ir.PassDesc.OP.resnet_unit( - X=x, - FilterX=filterX, - ScaleX=scaleX, - BiasX=biasX, - MeanX=meanX, - VarX=varX, - Z=z, - FilterZ=filterZ, - ScaleZ=scaleZ, - BiasZ=biasZ, - MeanZ=meanZ, - VarZ=varZ, - ) - set_resnet_unit_attrs(resnet_unit, True) - set_resnet_unit_outputs(resnet_unit, meanX, varX, meanZ, varZ) - return resnet_unit.Output("Y") - - return (pattern_one_input, replace_one_input), ( - pattern_two_input, - replace_two_input, - ) diff --git a/python/paddle/static/amp/fp16_lists.py b/python/paddle/static/amp/fp16_lists.py index bec67fd7a7414..53f7d034ed193 100644 --- a/python/paddle/static/amp/fp16_lists.py +++ b/python/paddle/static/amp/fp16_lists.py @@ -127,7 +127,7 @@ def _get_unsupported_list(dtype): # The set of ops that support fp16 calculation and are considered numerically- # safe and performance-critical. These ops are always converted to fp16. 
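# Given the name, `_get_white_list(dtype)` below is expected to merge this
# set into the white list only when `dtype` is float16, since these fused
# ops are fp16-only.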
-_only_supported_fp16_list = {'resnet_unit', 'fused_bn_add_activation'} +_only_supported_fp16_list = {'fused_bn_add_activation'} def _get_white_list(dtype): diff --git a/python/paddle/static/amp/fp16_utils.py b/python/paddle/static/amp/fp16_utils.py index f12f125462e48..5e6ddc6d7779d 100644 --- a/python/paddle/static/amp/fp16_utils.py +++ b/python/paddle/static/amp/fp16_utils.py @@ -132,8 +132,6 @@ def _keep_fp32_input(op, in_name): return in_name != 'X' if op_type == 'fused_bn_add_activation': return in_name not in {'X', 'Z'} - if op_type == 'resnet_unit': - return in_name not in {'X', 'FilterX', 'Z', 'FilterZ'} if op_type in ['fused_attention', 'fused_feedforward']: return in_name in { 'LnScale', @@ -154,8 +152,6 @@ def _keep_fp32_output(op, out_name): return out_name != 'Y' if op_type == 'layer_norm' and _keep_layer_norm_scale_bias_to_fp32(): return out_name != 'Y' - if op_type == 'resnet_unit': - return out_name not in {'Y', 'ConvX', 'ConvZ'} if op_type in ['fused_attention', 'fused_feedforward']: return out_name in { 'LnMean', diff --git a/test/cpp/fluid/CMakeLists.txt b/test/cpp/fluid/CMakeLists.txt index 948cbcc233dfe..088a4a9ec0789 100644 --- a/test/cpp/fluid/CMakeLists.txt +++ b/test/cpp/fluid/CMakeLists.txt @@ -7,20 +7,16 @@ if(WITH_CINN) add_subdirectory(cinn) endif() add_subdirectory(controlflow) -add_subdirectory(detection) + if(WITH_DLNNE) add_subdirectory(dlnne) endif() add_subdirectory(elementwise) add_subdirectory(fused) -if(WITH_LITE) - add_subdirectory(lite) -endif() add_subdirectory(math) if(WITH_ONEDNN) add_subdirectory(mkldnn) endif() -add_subdirectory(nccl) if(WITH_PSCORE) add_subdirectory(pscore) endif() diff --git a/test/cpp/fluid/detection/CMakeLists.txt b/test/cpp/fluid/detection/CMakeLists.txt deleted file mode 100644 index 6a69241e7846e..0000000000000 --- a/test/cpp/fluid/detection/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -paddle_test(mask_util_test SRCS mask_util_test.cc) - -if(WITH_ONNXRUNTIME AND WIN32) - # Copy onnxruntime for some c++ test in Windows, since the test will - # be build only in CI, so suppose the generator in Windows is Ninja. - copy_onnx(mask_util_test) -endif() diff --git a/test/cpp/fluid/detection/mask_util_test.cc b/test/cpp/fluid/detection/mask_util_test.cc deleted file mode 100644 index 274850c0a67dc..0000000000000 --- a/test/cpp/fluid/detection/mask_util_test.cc +++ /dev/null @@ -1,126 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/detection/mask_util.h" - -#include - -#include "paddle/fluid/memory/memory.h" - -namespace paddle { -namespace operators { - -template -void Compare(const T* a, const T* b, const int n) { - for (int i = 0; i < n; i++) { - EXPECT_EQ(a[i], b[i]); - } -} - -TEST(MaskUtil, Poly2MaskTest) { - float polys[] = {// NOLINT - 1.97f, - 1.88f, - 5.81f, - 1.88f, - 1.69f, - 6.53f, - 5.94f, - 6.38f, - 1.97f, - 1.88f}; - int h = 8, w = 8; - int k = 5; // length(polys) / 2 - // clang-format off - uint8_t expect_mask[] = { // NOLINT - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 1, 1, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 1, 0, 0, 0, 0, - 0, 0, 1, 1, 1, 0, 0, 0, - 0, 0, 1, 1, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0 - }; - // clang-format on - - // the ground-truth mask is computed by coco API: - // - // import pycocotools.mask as mask_util - // import numpy as np - // segm = [1.97, 1.88, 5.81, 1.88, 1.69, 6.53, 5.94, 6.38, 1.97, 1.88] - // rles = mask_util.frPyObjects([segm], im_h, im_w) - // mask = mask_util.decode(rles) - // print mask - platform::CPUPlace cpu; - auto allocation = memory::Alloc(cpu, sizeof(expect_mask)); - uint8_t* mask = reinterpret_cast(allocation->ptr()); - Poly2Mask(polys, k, h, w, mask); - Compare(expect_mask, mask, h * w); -} - -TEST(MaskUtil, Poly2BoxesTest) { - // clang-format off - std::vector>> polys = { - {{1.97f, 1.88f, 5.81f, 1.88f, 1.69f, 6.53f, 5.94f, 6.38f, 1.97f, 1.88f}}, - {{2.97f, 1.88f, 3.81f, 1.68f, 1.69f, 6.63f, 6.94f, 6.58f, 2.97f, 0.88f}} - }; - float expect_boxes[] = { // NOLINT - 1.69f, 1.88f, 5.94f, 6.53f, - 1.69f, 0.88f, 6.94f, 6.63f - }; - // clang-format on - - platform::CPUPlace cpu; - auto allocation = memory::Alloc(cpu, sizeof(expect_boxes)); - float* boxes = reinterpret_cast(allocation->ptr()); - Poly2Boxes(polys, boxes); - Compare(expect_boxes, boxes, 8); -} - -TEST(MaskUtil, Polys2MaskWrtBoxTest) { - // clang-format off - std::vector>> polys = {{ - {1.97f, 1.88f, 5.81f, 1.88f, 1.69f, 6.53f, 5.94f, 6.38f, 1.97f, 1.88f}, - {2.97f, 1.88f, 3.81f, 1.68f, 1.69f, 6.63f, 6.94f, 6.58f, 2.97f, 0.88f}}}; - float expect_boxes[] = { // NOLINT - 1.69f, 0.88f, 6.94f, 6.63f - }; - uint8_t expect_mask[] = { // NOLINT - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 1, 1, 1, 1, 1, 0, 0, - 0, 0, 1, 1, 1, 0, 0, 0, - 0, 0, 1, 1, 1, 0, 0, 0, - 0, 0, 1, 1, 1, 0, 0, 0, - 0, 1, 1, 1, 1, 1, 0, 0, - 0, 1, 1, 1, 1, 1, 1, 0, - 1, 1, 1, 1, 1, 1, 1, 1 - }; - // clang-format on - - platform::CPUPlace cpu; - auto allocation = memory::Alloc(cpu, sizeof(expect_boxes)); - float* boxes = reinterpret_cast(allocation->ptr()); - Poly2Boxes(polys, boxes); - Compare(expect_boxes, boxes, 4); - - auto allocation_mask = memory::Alloc(cpu, sizeof(expect_mask)); - uint8_t* mask = reinterpret_cast(allocation_mask->ptr()); - int M = 8; - Polys2MaskWrtBox(polys[0], expect_boxes, M, mask); - Compare(expect_mask, mask, M * M); -} - -} // namespace operators -} // namespace paddle diff --git a/test/cpp/fluid/lite/CMakeLists.txt b/test/cpp/fluid/lite/CMakeLists.txt deleted file mode 100644 index 6533073258ff5..0000000000000 --- a/test/cpp/fluid/lite/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -paddle_test(test_lite_engine_op SRCS lite_engine_op_test.cc) - -if(WITH_ONNXRUNTIME AND WIN32) - # Copy onnxruntime for some c++ test in Windows, since the test will - # be build only in CI, so suppose the generator in Windows is Ninja. 
- copy_onnx(test_lite_engine_op) -endif() diff --git a/test/cpp/fluid/lite/lite_engine_op_test.cc b/test/cpp/fluid/lite/lite_engine_op_test.cc deleted file mode 100644 index ca4dd444335d0..0000000000000 --- a/test/cpp/fluid/lite/lite_engine_op_test.cc +++ /dev/null @@ -1,117 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ - -#include "paddle/fluid/operators/lite/lite_engine_op.h" - -#include - -#include "paddle/fluid/framework/block_desc.h" -#include "paddle/fluid/framework/op_desc.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/inference/utils/singleton.h" -#include "paddle/fluid/operators/lite/ut_helper.h" - -USE_NO_KERNEL_OP(lite_engine) - -using paddle::inference::lite::AddFetchListToBlockDesc; -using paddle::inference::lite::AddTensorToBlockDesc; -using paddle::inference::lite::CreateTensor; -using paddle::inference::lite::serialize_params; -namespace paddle { -namespace operators { - -TEST(LiteEngineOp, engine_op) { - framework::ProgramDesc program; - auto* block_ = program.Proto()->mutable_blocks(0); - framework::BlockDesc block_desc(&program, block_); - auto* feed0 = block_desc.AppendOp(); - feed0->SetType("feed"); - feed0->SetInput("X", {"feed"}); - feed0->SetOutput("Out", {"x"}); - feed0->SetAttr("col", 0); - auto* feed1 = block_desc.AppendOp(); - feed1->SetType("feed"); - feed1->SetInput("X", {"feed"}); - feed1->SetOutput("Out", {"y"}); - feed1->SetAttr("col", 1); - LOG(INFO) << "create elementwise_add op"; - auto* elt_add = block_desc.AppendOp(); - elt_add->SetType("elementwise_add"); - elt_add->SetInput("X", std::vector({"x"})); - elt_add->SetInput("Y", std::vector({"y"})); - elt_add->SetOutput("Out", std::vector({"z"})); - elt_add->SetAttr("axis", -1); - LOG(INFO) << "create fetch op"; - auto* fetch = block_desc.AppendOp(); - fetch->SetType("fetch"); - fetch->SetInput("X", std::vector({"z"})); - fetch->SetOutput("Out", std::vector({"out"})); - fetch->SetAttr("col", 0); - // Set inputs' variable shape in BlockDesc - AddTensorToBlockDesc(block_, "x", std::vector({2, 4}), true); - AddTensorToBlockDesc(block_, "y", std::vector({2, 4}), true); - AddTensorToBlockDesc(block_, "z", std::vector({2, 4}), false); - AddFetchListToBlockDesc(block_, "out"); - *block_->add_ops() = *feed1->Proto(); - *block_->add_ops() = *feed0->Proto(); - *block_->add_ops() = *elt_add->Proto(); - *block_->add_ops() = *fetch->Proto(); - framework::Scope scope; - platform::CPUPlace place; - phi::CPUContext ctx(place); - // Prepare variables. 
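-  //
-  // The block assembled above is, schematically:
-  //
-  //   feed(col=0) -> x --+
-  //                      +--> elementwise_add -> z -> fetch(col=0) -> out
-  //   feed(col=1) -> y --+
-  //
-  // with x, y and z all of shape {2, 4}.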
- CreateTensor(&scope, "x", std::vector({2, 4})); - CreateTensor(&scope, "y", std::vector({2, 4})); - CreateTensor(&scope, "out", std::vector({2, 4})); - - ASSERT_EQ(block_->ops_size(), 4); - - std::vector repetitive_params{"x", "y"}; - inference::lite::EngineConfig config; - config.valid_places = { -#if defined(PADDLE_WITH_ARM) - paddle::lite_api::Place({TARGET(kARM), PRECISION(kFloat)}), -#else - paddle::lite_api::Place({TARGET(kX86), PRECISION(kFloat)}), -#endif - paddle::lite_api::Place({TARGET(kHost), PRECISION(kAny)}), - }; - serialize_params(&(config.param), &scope, repetitive_params); - config.model = program.Proto()->SerializeAsString(); - LOG(INFO) << "create lite_engine desc"; - framework::OpDesc engine_op_desc(nullptr); - engine_op_desc.SetType("lite_engine"); - engine_op_desc.SetInput("Xs", std::vector({"x", "y"})); - engine_op_desc.SetOutput("Ys", std::vector({"out"})); - std::string engine_key = "engine_0"; - engine_op_desc.SetAttr("engine_key", engine_key); - engine_op_desc.SetAttr("enable_int8", false); - engine_op_desc.SetAttr("use_gpu", true); - engine_op_desc.SetAttr("zero_copy", true); - engine_op_desc.SetBlockAttr("sub_block", &block_desc); - // TODO(wilber): The ut is out of date, we need to a new lite subgraph test. - // inference::Singleton::Global().Create( - // engine_key, config); - // LOG(INFO) << "create engine op"; - // auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc); - // LOG(INFO) << "engine_op " << engine_op.get(); - // // Execute them. - // LOG(INFO) << "engine_op run"; - // engine_op->Run(scope, place); - // LOG(INFO) << "done"; -} - -} // namespace operators -} // namespace paddle diff --git a/test/cpp/fluid/nccl/CMakeLists.txt b/test/cpp/fluid/nccl/CMakeLists.txt deleted file mode 100644 index a8bd7b7f55634..0000000000000 --- a/test/cpp/fluid/nccl/CMakeLists.txt +++ /dev/null @@ -1,17 +0,0 @@ -if(NOT (WITH_NCCL OR WITH_RCCL)) - return() -endif() - -if(WITH_GPU AND NOT WIN32) - nv_test( - nccl_op_test - SRCS nccl_op_test.cu.cc - DEPS nccl_op gpu_info device_context) -endif() - -if(WITH_ROCM AND NOT WIN32) - hip_test( - nccl_op_test - SRCS nccl_op_test.cu.cc - DEPS nccl_op gpu_info device_context) -endif() diff --git a/test/cpp/fluid/nccl/nccl_op_test.cu.cc b/test/cpp/fluid/nccl/nccl_op_test.cu.cc deleted file mode 100644 index b8a47b9703165..0000000000000 --- a/test/cpp/fluid/nccl/nccl_op_test.cu.cc +++ /dev/null @@ -1,318 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include - -#include -#include // NOLINT -#include // NOLINT -#include - -#include "paddle/fluid/framework/op_desc.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/init.h" -#include "paddle/fluid/platform/place.h" - -USE_NO_KERNEL_OP(ncclInit); -USE_OP_ITSELF(ncclAllReduce); -USE_OP_ITSELF(ncclReduce); -USE_OP_ITSELF(ncclBcast); -PD_DECLARE_KERNEL(ncclAllReduce, GPU, ALL_LAYOUT); -PD_DECLARE_KERNEL(ncclReduce, GPU, ALL_LAYOUT); -PD_DECLARE_KERNEL(ncclBcast, GPU, ALL_LAYOUT); - -namespace f = paddle::framework; -namespace p = paddle::platform; - -// test data amount -const f::DDim kDims = {20, 20}; - -// nccl op common tester, init communicator. -class NCCLTester : public ::testing::Test { - public: - void SetUp() override { - int count = p::GetGPUDeviceCount(); - if (count <= 0) { - LOG(WARNING) << "Cannot test gpu nccl, because the CUDA device count is " - << count; - exit(0); - } - for (int i = 0; i < count; ++i) { - gpu_list_.emplace_back(i); - } - - p::CPUPlace cpu_place; - f::InitDevices(); - pool_ptr_ = &p::DeviceContextPool::Instance(); - - NCCLInitOp(); - } - - void NCCLInitOp() { - paddle::platform::CPUPlace cpu_place; - std::unique_ptr op1(new f::OpDesc); - - op1->SetType("ncclInit"); - op1->SetInput("parallel_scopes", {"p_scopes"}); - op1->SetOutput("Communicator", {"comm"}); - - auto *var = g_scope_.Var("comm"); - var->GetMutable(); - - auto *scope_var = g_scope_.Var("p_scopes"); - auto *p_scopes = scope_var->GetMutable>(); - (*p_scopes).resize(gpu_list_.size()); - - auto op = f::OpRegistry::CreateOp(*op1); - VLOG(1) << "invoke NCCLInitOp."; - op->Run(g_scope_, cpu_place); - VLOG(1) << "NCCLInitOp finished."; - } - - int GetGPUData(int gpu_id) { return gpu_id + 42; } - - template - void PerThreadProgram(int gpu_id, const f::OpDesc &op_desc, f::Scope *scope) { - std::unique_lock lk(mu_); - const f::OpDesc *op1 = &op_desc; - - p::CUDAPlace place(gpu_id); - const auto &ctx = pool_ptr_->Get(place); - - auto *send_tensor = scope->Var("st")->GetMutable(); - auto *recv_tensor = scope->Var("rt")->GetMutable(); - - if (!send_tensor->numel()) { - send_tensor->mutable_data(kDims, place); - - std::vector send_vector(common::product(kDims), GetGPUData(gpu_id)); - paddle::framework::TensorFromVector(send_vector, *ctx, send_tensor); - VLOG(1) << "Send Tensor filled with elements " << send_tensor->numel(); - } - - lk.unlock(); - - PADDLE_ENFORCE_EQ( - send_tensor->numel(), - common::product(kDims), - paddle::platform::errors::InvalidArgument("Tensor numel not match!")); - - auto op = f::OpRegistry::CreateOp(*op1); - - VLOG(1) << "Device : " << gpu_id << " invoke " << op_desc.Type(); - VLOG(1) << " send_tensor : " << send_tensor->numel() - << " recv_tensor : " << recv_tensor->numel(); - op->Run(*scope, place); - VLOG(1) << "Device : " << gpu_id << " finished " << op_desc.Type(); - } - - void testNcclReduceOp(); - void testNcclAllReduceOp(); - void testNcclBcastOp(); - - public: - p::DeviceContextPool *pool_ptr_; - f::Scope g_scope_; - std::mutex mu_; - std::vector gpu_list_; -}; - -void NCCLTester::testNcclAllReduceOp() { - std::unique_ptr op2(new f::OpDesc); - op2->SetType("ncclAllReduce"); - op2->SetInput("X", {"st"}); - op2->SetInput("Communicator", {"comm"}); - op2->SetOutput("Out", 
{"rt"}); - - std::vector dev_scopes; - - std::vector ths; - - for (size_t i = 0; i < gpu_list_.size(); ++i) { - dev_scopes.emplace_back(&g_scope_.NewScope()); - std::thread th(&NCCLTester::PerThreadProgram, - this, - gpu_list_[i], - *op2.get(), - dev_scopes[i]); - ths.emplace_back(std::move(th)); - } - - for (size_t i = 0; i < gpu_list_.size(); ++i) { - ths[i].join(); - } - - float expected_result = 0.0; - for (int gpu_id : gpu_list_) { - expected_result = expected_result + GetGPUData(gpu_id); - } - - for (size_t i = 0; i < dev_scopes.size(); ++i) { - p::CPUPlace cpu_place; - p::CUDAPlace gpu_place(gpu_list_[i]); - - auto &recv_tensor = dev_scopes[i]->FindVar("rt")->Get(); - auto *rt = recv_tensor.data(); - auto *result_tensor = - dev_scopes[i]->Var("ct")->GetMutable(); - result_tensor->Resize(kDims); - auto *ct = result_tensor->mutable_data(cpu_place); - - auto *dev_ctx = static_cast(pool_ptr_->Get(gpu_place)); - paddle::memory::Copy(cpu_place, - ct, - p::CUDAPlace(gpu_list_[i]), - rt, - recv_tensor.numel() * sizeof(float), - dev_ctx->stream()); - dev_ctx->Wait(); - - for (int64_t j = 0; j < common::product(kDims); ++j) { - ASSERT_NEAR(ct[j], expected_result, 1e-5); - } - } -} - -void NCCLTester::testNcclReduceOp() { - std::unique_ptr op2(new f::OpDesc); - const int kRoot = 0; - op2->SetType("ncclReduce"); - op2->SetInput("X", {"st"}); - op2->SetInput("Communicator", {"comm"}); - op2->SetOutput("Out", {"rt"}); - op2->SetAttr("root", kRoot); - - std::vector dev_scopes; - - std::vector ths; - - for (size_t i = 0; i < gpu_list_.size(); ++i) { - dev_scopes.emplace_back(&g_scope_.NewScope()); - std::thread th(&NCCLTester::PerThreadProgram, - this, - gpu_list_[i], - *op2.get(), - dev_scopes[i]); - ths.emplace_back(std::move(th)); - } - - for (size_t i = 0; i < gpu_list_.size(); ++i) { - ths[i].join(); - } - - float expected_result = 0.0; - for (int gpu_id : gpu_list_) { - expected_result = expected_result + GetGPUData(gpu_id); - } - - p::CPUPlace cpu_place; - p::CUDAPlace gpu_place(gpu_list_[kRoot]); - - auto &recv_tensor = dev_scopes[kRoot]->FindVar("rt")->Get(); - auto *rt = recv_tensor.data(); - auto *result_tensor = - dev_scopes[kRoot]->Var("ct")->GetMutable(); - result_tensor->Resize(kDims); - auto *ct = result_tensor->mutable_data(cpu_place); - - auto *dev_ctx = static_cast(pool_ptr_->Get(gpu_place)); - paddle::memory::Copy(cpu_place, - ct, - p::CUDAPlace(gpu_list_[kRoot]), - rt, - recv_tensor.numel() * sizeof(float), - dev_ctx->stream()); - dev_ctx->Wait(); - - for (int64_t j = 0; j < common::product(kDims); ++j) { - ASSERT_NEAR(ct[j], expected_result, 1e-5); - } -} - -void NCCLTester::testNcclBcastOp() { - std::unique_ptr op2(new f::OpDesc); - const int kRoot = 0; - op2->SetType("ncclBcast"); - op2->SetInput("X", {"st"}); - op2->SetInput("Communicator", {"comm"}); - op2->SetOutput("Out", {"rt"}); - op2->SetAttr("root", kRoot); - - std::vector dev_scopes; - - std::vector ths; - - for (size_t i = 0; i < gpu_list_.size(); ++i) { - dev_scopes.emplace_back(&g_scope_.NewScope()); - std::thread th(&NCCLTester::PerThreadProgram, - this, - gpu_list_[i], - *op2.get(), - dev_scopes[i]); - ths.emplace_back(std::move(th)); - } - - for (size_t i = 0; i < gpu_list_.size(); ++i) { - ths[i].join(); - } - - const int idx = gpu_list_.size() - 1; - float result = GetGPUData(kRoot); - - p::CPUPlace cpu_place; - p::CUDAPlace gpu_place(gpu_list_[idx]); - - std::string rt_str = "rt"; - if (idx == kRoot) { - rt_str = "st"; - } - auto &recv_tensor = dev_scopes[idx]->FindVar(rt_str)->Get(); - auto *rt = 
recv_tensor.data(); - auto *result_tensor = - dev_scopes[idx]->Var("ct")->GetMutable(); - result_tensor->Resize(kDims); - auto *ct = result_tensor->mutable_data(cpu_place); - - auto *dev_ctx = static_cast(pool_ptr_->Get(gpu_place)); - paddle::memory::Copy(cpu_place, - ct, - p::CUDAPlace(gpu_list_[idx]), - rt, - recv_tensor.numel() * sizeof(float), - dev_ctx->stream()); - dev_ctx->Wait(); - - for (int64_t j = 0; j < common::product(kDims); ++j) { - ASSERT_NEAR(ct[j], result, 1e-5); - } -} - -// ncclInitOp with desc -TEST_F(NCCLTester, ncclInitOp) {} - -TEST_F(NCCLTester, ncclOp) { - // Serial execution is required for the same nccl comm. - - testNcclReduceOp(); - - testNcclAllReduceOp(); - - testNcclBcastOp(); -} diff --git a/test/cpp/inference/api/CMakeLists.txt b/test/cpp/inference/api/CMakeLists.txt index 14392a60feaf7..14052463f2b0c 100644 --- a/test/cpp/inference/api/CMakeLists.txt +++ b/test/cpp/inference/api/CMakeLists.txt @@ -772,14 +772,6 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) inference_analysis_api_test_build(${INT8_OBJ_DETECT_TEST_APP} ${INT8_OBJ_DETECT_TEST_APP_SRC}) - # mobilenet-ssd int8 - set(INT8_MOBILENET_SSD_MODEL_DIR "${INT8_DATA_DIR}/mobilenet-ssd") - download_int8_data_without_verify(${INT8_MOBILENET_SSD_MODEL_DIR} - "mobilenet_ssd_int8_model.tar.gz") - inference_analysis_api_object_dection_int8_test_run( - test_analyzer_int8_mobilenet_ssd ${INT8_OBJ_DETECT_TEST_APP} - ${INT8_MOBILENET_SSD_MODEL_DIR} ${PASCALVOC_DATA_PATH}) - ### Lexcial analysis GRU model set(GRU_PATH "${INFERENCE_DEMO_INSTALL_DIR}/gru") download_gru_data_without_verify("${GRU_PATH}" "GRU_eval_data.tar.gz") @@ -1328,8 +1320,6 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) if(WITH_ONEDNN) set_tests_properties(test_analyzer_int8_resnet50 PROPERTIES TIMEOUT 120) - set_tests_properties(test_analyzer_int8_mobilenet_ssd PROPERTIES TIMEOUT - 120) set_tests_properties(test_analyzer_quant_performance_benchmark PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_int8_mobilenetv2 PROPERTIES TIMEOUT 120) diff --git a/test/ir/CMakeLists.txt b/test/ir/CMakeLists.txt index 134783e11c35d..44af6ff2518a0 100644 --- a/test/ir/CMakeLists.txt +++ b/test/ir/CMakeLists.txt @@ -18,7 +18,6 @@ else() set_tests_properties(${target} PROPERTIES LABELS "RUN_TYPE=INFER") endforeach() add_subdirectory(pir) - set_tests_properties(test_fuse_resnet_unit PROPERTIES TIMEOUT 120) set_tests_properties(test_convert_to_mixed_precision PROPERTIES TIMEOUT 300) endif() diff --git a/test/ir/test_fuse_resnet_unit.py b/test/ir/test_fuse_resnet_unit.py deleted file mode 100644 index 7e5885116e087..0000000000000 --- a/test/ir/test_fuse_resnet_unit.py +++ /dev/null @@ -1,72 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
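-
-# The test below builds a Conv2D + BatchNorm(act='relu') block in NHWC with
-# float16 data, applies the `fuse_resnet_unit` IR pass to its graph, and
-# checks that the fused program matches the unfused one numerically
-# (rtol=1e-05, atol=0.005).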
- -import unittest - -import numpy as np - -import paddle -import paddle.incubate -from paddle.base import core - -paddle.enable_static() -np.random.seed(0) - - -@unittest.skipIf( - not paddle.is_compiled_with_cuda() - or paddle.get_cudnn_version() < 8000 - or paddle.device.cuda.get_device_capability()[0] < 7 - or paddle.device.cuda.get_device_capability()[0] >= 9, - "only support with cuda and cudnn version is at least 8.0 " - "and device's compute capability is at least 7.0 and less than 9.0", -) -class TestFuseResNetUnit(unittest.TestCase): - def test_fuse_resnet_unit(self): - place = paddle.CUDAPlace(0) - program = paddle.static.Program() - startup_program = paddle.static.Program() - with paddle.static.amp.fp16_guard(): - with paddle.static.program_guard(program, startup_program): - x = paddle.static.data("x", [1, 64, 64, 8], dtype="float16") - conv2d = paddle.nn.Conv2D( - 8, 32, 1, bias_attr=False, data_format='NHWC' - ) - batch_norm = paddle.nn.BatchNorm( - 32, act='relu', data_layout='NHWC' - ) - out = batch_norm(conv2d(x)) - graph = core.Graph(program.desc) - core.get_pass("fuse_resnet_unit").apply(graph) - after_program = paddle.base.framework.IrGraph(graph).to_program() - params = paddle.static.amp.cast_model_to_fp16(program) - after_params = paddle.static.amp.cast_model_to_fp16(after_program) - exe = paddle.static.Executor(place) - exe.run(startup_program) - paddle.static.amp.cast_parameters_to_fp16( - place, program, to_fp16_var_names=params - ) - paddle.static.amp.cast_parameters_to_fp16( - place, after_program, to_fp16_var_names=after_params - ) - feed = {"x": np.random.randn(1, 64, 64, 8).astype("float16")} - before_out = exe.run(program, feed=feed, fetch_list=[out.name]) - after_out = exe.run(after_program, feed=feed, fetch_list=[out.name]) - np.testing.assert_allclose( - before_out[0], after_out[0], rtol=1e-05, atol=0.005 - ) - - -if __name__ == '__main__': - unittest.main() diff --git a/test/legacy_test/CMakeLists.txt b/test/legacy_test/CMakeLists.txt index 63d84ece4aa98..a0ab822524de4 100644 --- a/test/legacy_test/CMakeLists.txt +++ b/test/legacy_test/CMakeLists.txt @@ -191,8 +191,6 @@ endif() if((NOT WITH_GPU) AND (NOT WITH_ROCM)) list(REMOVE_ITEM TEST_OPS test_fused_conv2d_add_act_op) - list(REMOVE_ITEM TEST_OPS test_rank_attention_op) - # TODO(shenliang03): rank_attention_op support CPU device in future list(REMOVE_ITEM TEST_OPS test_batch_fc_op) # TODO(shenliang03): batch_fc_op support CPU device in future # TODO(Yancey1989): parallel dygraph support CPU device in future @@ -251,7 +249,6 @@ if(APPLE) ) # this op is not support on mac list(REMOVE_ITEM TEST_OPS test_fusion_seqexpand_concat_fc_op) - list(REMOVE_ITEM TEST_OPS test_detection_map_op) list(REMOVE_ITEM TEST_OPS test_fuse_elewise_add_act_pass) endif() if(NOT WITH_MKLML) @@ -525,7 +522,6 @@ set(TEST_OPS_WITH_GC test_fill_zeros_like2_op test_gather_op test_gather_nd_op - test_linear_chain_crf_op test_lod_reset_op test_lookup_table_op test_mean_op @@ -1187,9 +1183,6 @@ if((WITH_ROCM OR WITH_GPU) AND NOT WIN32) set_tests_properties(test_pipeline_parallel PROPERTIES TIMEOUT 120) set_tests_properties(test_fleet_perf_test PROPERTIES TIMEOUT 120) endif() -if(WITH_GPU OR WITH_ROCM) - set_tests_properties(test_rank_attention_op PROPERTIES TIMEOUT 120) -endif() if(WITH_GPU AND NOT WIN32) set_tests_properties(test_fused_multi_transformer_int8_op PROPERTIES TIMEOUT 60) diff --git a/test/legacy_test/test_ctc_align.py b/test/legacy_test/test_ctc_align.py deleted file mode 100644 index 699b176518be1..0000000000000 --- 
a/test/legacy_test/test_ctc_align.py +++ /dev/null @@ -1,232 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -from op_test import OpTest - -import paddle - - -def CTCAlign(input, lod, blank, merge_repeated, padding=0, input_length=None): - if input_length is None: - lod0 = lod[0] - result = [] - cur_offset = 0 - for i in range(len(lod0)): - prev_token = -1 - for j in range(cur_offset, cur_offset + lod0[i]): - token = input[j][0] - if (token != blank) and not ( - merge_repeated and token == prev_token - ): - result.append(token) - prev_token = token - cur_offset += lod0[i] - result = np.array(result).reshape([len(result), 1]).astype("int32") - if len(result) == 0: - result = np.array([[-1]]) - return result - else: - result = [[] for i in range(len(input))] - output_length = [] - for i in range(len(input)): - prev_token = -1 - for j in range(input_length[i][0]): - token = input[i][j] - if (token != blank) and not ( - merge_repeated and token == prev_token - ): - result[i].append(token) - prev_token = token - start = len(result[i]) - output_length.append([start]) - for j in range(start, len(input[i])): - result[i].append(padding) - result = ( - np.array(result) - .reshape([len(input), len(input[0])]) - .astype("int32") - ) - output_length = ( - np.array(output_length).reshape([len(input), 1]).astype("int32") - ) - - return result, output_length - - -class TestCTCAlignOp(OpTest): - def config(self): - self.op_type = "ctc_align" - self.input_lod = [[11, 7]] - self.blank = 0 - self.merge_repeated = False - self.input = ( - np.array([0, 1, 2, 2, 0, 4, 0, 4, 5, 0, 6, 6, 0, 0, 7, 7, 7, 0]) - .reshape([18, 1]) - .astype("int32") - ) - - def setUp(self): - self.config() - output = CTCAlign( - self.input, self.input_lod, self.blank, self.merge_repeated - ) - - self.inputs = { - "Input": (self.input, self.input_lod), - } - self.outputs = {"Output": output} - self.attrs = { - "blank": self.blank, - "merge_repeated": self.merge_repeated, - } - - def test_check_output(self): - # NODE(yjjiang11): This op will be deprecated. 
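-        # Worked example for the config above (blank=0,
-        # merge_repeated=False, lod=[[11, 7]]): only blanks are dropped and
-        # repeats are kept, so
-        #   [0, 1, 2, 2, 0, 4, 0, 4, 5, 0, 6] -> [1, 2, 2, 4, 4, 5, 6]
-        #   [6, 0, 0, 7, 7, 7, 0]             -> [6, 7, 7, 7]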
- self.check_output(check_dygraph=False) - - -class TestCTCAlignOpCase1(TestCTCAlignOp): - def config(self): - self.op_type = "ctc_align" - self.input_lod = [[11, 8]] - self.blank = 0 - self.merge_repeated = True - self.input = ( - np.array([0, 1, 2, 2, 0, 4, 0, 4, 5, 0, 6, 6, 0, 0, 7, 7, 7, 0, 0]) - .reshape([19, 1]) - .astype("int32") - ) - - -class TestCTCAlignOpCase2(TestCTCAlignOp): - def config(self): - self.op_type = "ctc_align" - self.input_lod = [[4]] - self.blank = 0 - self.merge_repeated = True - self.input = np.array([0, 0, 0, 0]).reshape([4, 1]).astype("int32") - - -class TestCTCAlignPaddingOp(OpTest): - def config(self): - self.op_type = "ctc_align" - self.input_lod = [] - self.blank = 0 - self.padding_value = 0 - self.merge_repeated = True - self.input = ( - np.array( - [ - [0, 2, 4, 4, 0, 6, 3, 6, 6, 0, 0], - [1, 1, 3, 0, 0, 4, 5, 6, 0, 0, 0], - ] - ) - .reshape([2, 11]) - .astype("int32") - ) - self.input_length = np.array([[9], [8]]).reshape([2, 1]).astype("int32") - - def setUp(self): - self.config() - output, output_length = CTCAlign( - self.input, - self.input_lod, - self.blank, - self.merge_repeated, - self.padding_value, - self.input_length, - ) - self.inputs = { - "Input": (self.input, self.input_lod), - "InputLength": self.input_length, - } - self.outputs = {"Output": output, "OutputLength": output_length} - self.attrs = { - "blank": self.blank, - "merge_repeated": self.merge_repeated, - "padding_value": self.padding_value, - } - - def test_check_output(self): - # NODE(yjjiang11): This op will be deprecated. - self.check_output(check_dygraph=False) - - -class TestCTCAlignOpCase3(TestCTCAlignPaddingOp): - def config(self): - self.op_type = "ctc_align" - self.blank = 0 - self.input_lod = [] - self.merge_repeated = True - self.padding_value = 0 - self.input = ( - np.array( - [[0, 1, 2, 2, 0, 4], [0, 4, 5, 0, 6, 0], [0, 7, 7, 7, 0, 0]] - ) - .reshape([3, 6]) - .astype("int32") - ) - self.input_length = ( - np.array([[6], [5], [4]]).reshape([3, 1]).astype("int32") - ) - - -class TestCTCAlignOpCase4(TestCTCAlignPaddingOp): - ''' - # test tensor input which has attr input padding_value - ''' - - def config(self): - self.op_type = "ctc_align" - self.blank = 0 - self.input_lod = [] - self.merge_repeated = False - self.padding_value = 0 - self.input = ( - np.array( - [[0, 1, 2, 2, 0, 4], [0, 4, 5, 0, 6, 0], [0, 7, 7, 7, 0, 0]] - ) - .reshape([3, 6]) - .astype("int32") - ) - self.input_length = ( - np.array([[6], [5], [4]]).reshape([3, 1]).astype("int32") - ) - - -class TestCTCAlignOpCase5(TestCTCAlignPaddingOp): - def config(self): - self.op_type = "ctc_align" - self.blank = 0 - self.input_lod = [] - self.merge_repeated = False - self.padding_value = 1 - self.input = ( - np.array( - [[0, 1, 2, 2, 0, 4], [0, 4, 5, 0, 6, 0], [0, 7, 1, 7, 0, 0]] - ) - .reshape([3, 6]) - .astype("int32") - ) - self.input_length = ( - np.array([[6], [5], [4]]).reshape([3, 1]).astype("int32") - ) - - -if __name__ == "__main__": - paddle.enable_static() - unittest.main() diff --git a/test/legacy_test/test_detection_map_op.py b/test/legacy_test/test_detection_map_op.py deleted file mode 100644 index 376b9876cd46a..0000000000000 --- a/test/legacy_test/test_detection_map_op.py +++ /dev/null @@ -1,360 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import collections -import math -import unittest - -import numpy as np -from op_test import OpTest - - -class TestDetectionMAPOp(OpTest): - def set_data(self): - self.class_num = 4 - self.init_test_case() - self.mAP = [self.calc_map(self.tf_pos, self.tf_pos_lod)] - self.label = np.array(self.label).astype('float32') - self.detect = np.array(self.detect).astype('float32') - self.mAP = np.array(self.mAP).astype('float32') - - if len(self.class_pos_count) > 0: - self.class_pos_count = np.array(self.class_pos_count).astype( - 'int32' - ) - self.true_pos = np.array(self.true_pos).astype('float32') - self.false_pos = np.array(self.false_pos).astype('float32') - self.has_state = np.array([1]).astype('int32') - - self.inputs = { - 'Label': (self.label, self.label_lod), - 'DetectRes': (self.detect, self.detect_lod), - 'HasState': self.has_state, - 'PosCount': self.class_pos_count, - 'TruePos': (self.true_pos, self.true_pos_lod), - 'FalsePos': (self.false_pos, self.false_pos_lod), - } - else: - self.inputs = { - 'Label': (self.label, self.label_lod), - 'DetectRes': (self.detect, self.detect_lod), - } - - self.attrs = { - 'overlap_threshold': self.overlap_threshold, - 'evaluate_difficult': self.evaluate_difficult, - 'ap_type': self.ap_type, - 'class_num': self.class_num, - } - - self.out_class_pos_count = np.array(self.out_class_pos_count).astype( - 'int' - ) - self.out_true_pos = np.array(self.out_true_pos).astype('float32') - self.out_false_pos = np.array(self.out_false_pos).astype('float32') - - self.outputs = { - 'MAP': self.mAP, - 'AccumPosCount': self.out_class_pos_count, - 'AccumTruePos': (self.out_true_pos, self.out_true_pos_lod), - 'AccumFalsePos': (self.out_false_pos, self.out_false_pos_lod), - } - - def init_test_case(self): - self.overlap_threshold = 0.3 - self.evaluate_difficult = True - self.ap_type = "integral" - - self.label_lod = [[2, 2]] - # label difficult xmin ymin xmax ymax - self.label = [ - [1, 0, 0.1, 0.1, 0.3, 0.3], - [1, 1, 0.6, 0.6, 0.8, 0.8], - [2, 0, 0.3, 0.3, 0.6, 0.5], - [1, 0, 0.7, 0.1, 0.9, 0.3], - ] - - # label score xmin ymin xmax ymax difficult - self.detect_lod = [[3, 4]] - self.detect = [ - [1, 0.3, 0.1, 0.0, 0.4, 0.3], - [1, 0.7, 0.0, 0.1, 0.2, 0.3], - [1, 0.9, 0.7, 0.6, 0.8, 0.8], - [2, 0.8, 0.2, 0.1, 0.4, 0.4], - [2, 0.1, 0.4, 0.3, 0.7, 0.5], - [1, 0.2, 0.8, 0.1, 1.0, 0.3], - [3, 0.2, 0.8, 0.1, 1.0, 0.3], - ] - - # label score true_pos false_pos - self.tf_pos_lod = [[3, 4]] - self.tf_pos = [ - [1, 0.9, 1, 0], - [1, 0.7, 1, 0], - [1, 0.3, 0, 1], - [1, 0.2, 1, 0], - [2, 0.8, 0, 1], - [2, 0.1, 1, 0], - [3, 0.2, 0, 1], - ] - - self.class_pos_count = [] - self.true_pos_lod = [[]] - self.true_pos = [[]] - self.false_pos_lod = [[]] - self.false_pos = [[]] - - def calc_map(self, tf_pos, tf_pos_lod): - mAP = 0.0 - count = 0 - - def get_input_pos( - class_pos_count, true_pos, true_pos_lod, false_pos, false_pos_lod - ): - class_pos_count_dict = collections.Counter() - true_pos_dict = collections.defaultdict(list) - false_pos_dict = collections.defaultdict(list) - for i, count in enumerate(class_pos_count): - class_pos_count_dict[i] = count - - cur_pos = 0 - 
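# (Editor's note, not part of the deleted file: true_pos_lod / false_pos_lod
# are length-based LoDs -- entry i is the number of (score, count) rows that
# belong to class i -- so the loops below turn lengths such as [3, 2] into
# the row ranges [0, 3) and [3, 5).)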
for i in range(len(true_pos_lod[0])): - start = cur_pos - cur_pos += true_pos_lod[0][i] - end = cur_pos - for j in range(start, end): - true_pos_dict[i].append(true_pos[j]) - - cur_pos = 0 - for i in range(len(false_pos_lod[0])): - start = cur_pos - cur_pos += false_pos_lod[0][i] - end = cur_pos - for j in range(start, end): - false_pos_dict[i].append(false_pos[j]) - - return class_pos_count_dict, true_pos_dict, false_pos_dict - - def get_output_pos(label_count, true_pos, false_pos): - label_number = self.class_num - - out_class_pos_count = [] - out_true_pos_lod = [] - out_true_pos = [] - out_false_pos_lod = [] - out_false_pos = [] - - for i in range(label_number): - out_class_pos_count.append([label_count[i]]) - true_pos_list = true_pos[i] - out_true_pos += true_pos_list - out_true_pos_lod.append(len(true_pos_list)) - false_pos_list = false_pos[i] - out_false_pos += false_pos_list - out_false_pos_lod.append(len(false_pos_list)) - - return ( - out_class_pos_count, - out_true_pos, - [out_true_pos_lod], - out_false_pos, - [out_false_pos_lod], - ) - - def get_accumulation(pos_list): - sorted_list = sorted(pos_list, key=lambda pos: pos[0], reverse=True) - sum = 0 - accu_list = [] - for score, count in sorted_list: - sum += count - accu_list.append(sum) - return accu_list - - label_count, true_pos, false_pos = get_input_pos( - self.class_pos_count, - self.true_pos, - self.true_pos_lod, - self.false_pos, - self.false_pos_lod, - ) - for v in self.label: - label = v[0] - difficult = False if len(v) == 5 else v[1] - if self.evaluate_difficult: - label_count[label] += 1 - elif not difficult: - label_count[label] += 1 - - for label, score, tp, fp in tf_pos: - true_pos[label].append([score, tp]) - false_pos[label].append([score, fp]) - - for label, label_pos_num in label_count.items(): - if label_pos_num == 0: - continue - if label not in true_pos: - count += 1 - continue - label_true_pos = true_pos[label] - label_false_pos = false_pos[label] - - accu_tp_sum = get_accumulation(label_true_pos) - accu_fp_sum = get_accumulation(label_false_pos) - - precision = [] - recall = [] - - for i in range(len(accu_tp_sum)): - precision.append( - float(accu_tp_sum[i]) - / float(accu_tp_sum[i] + accu_fp_sum[i]) - ) - recall.append(float(accu_tp_sum[i]) / label_pos_num) - - if self.ap_type == "11point": - max_precisions = [0.0] * 11 - start_idx = len(accu_tp_sum) - 1 - for j in range(10, -1, -1): - for i in range(start_idx, -1, -1): - if recall[i] < float(j) / 10.0: - start_idx = i - if j > 0: - max_precisions[j - 1] = max_precisions[j] - break - else: - if max_precisions[j] < precision[i]: - max_precisions[j] = precision[i] - for j in range(10, -1, -1): - mAP += max_precisions[j] / 11 - count += 1 - elif self.ap_type == "integral": - average_precisions = 0.0 - prev_recall = 0.0 - for i in range(len(accu_tp_sum)): - if math.fabs(recall[i] - prev_recall) > 1e-6: - average_precisions += precision[i] * math.fabs( - recall[i] - prev_recall - ) - prev_recall = recall[i] - - mAP += average_precisions - count += 1 - pcnt, tp, tp_lod, fp, fp_lod = get_output_pos( - label_count, true_pos, false_pos - ) - self.out_class_pos_count = pcnt - self.out_true_pos = tp - self.out_true_pos_lod = tp_lod - self.out_false_pos = fp - self.out_false_pos_lod = fp_lod - if count != 0: - mAP /= count - return mAP - - def setUp(self): - self.op_type = "detection_map" - self.set_data() - - def test_check_output(self): - # NODE(yjjiang11): This op will be deprecated. 
- self.check_output(check_dygraph=False) - - -class TestDetectionMAPOpSkipDiff(TestDetectionMAPOp): - def init_test_case(self): - super().init_test_case() - - self.evaluate_difficult = False - - self.tf_pos_lod = [[2, 4]] - # label score true_pos false_pos - self.tf_pos = [ - [1, 0.7, 1, 0], - [1, 0.3, 0, 1], - [1, 0.2, 1, 0], - [2, 0.8, 0, 1], - [2, 0.1, 1, 0], - [3, 0.2, 0, 1], - ] - - -class TestDetectionMAPOpWithoutDiff(TestDetectionMAPOp): - def init_test_case(self): - super().init_test_case() - - # label xmin ymin xmax ymax - self.label = [ - [1, 0.1, 0.1, 0.3, 0.3], - [1, 0.6, 0.6, 0.8, 0.8], - [2, 0.3, 0.3, 0.6, 0.5], - [1, 0.7, 0.1, 0.9, 0.3], - ] - - -class TestDetectionMAPOp11Point(TestDetectionMAPOp): - def init_test_case(self): - super().init_test_case() - - self.ap_type = "11point" - - -class TestDetectionMAPOpMultiBatch(TestDetectionMAPOp): - def init_test_case(self): - super().init_test_case() - self.class_pos_count = [0, 2, 1, 0] - self.true_pos_lod = [[0, 3, 2]] - self.true_pos = [ - [0.7, 1.0], - [0.3, 0.0], - [0.2, 1.0], - [0.8, 0.0], - [0.1, 1.0], - ] - self.false_pos_lod = [[0, 3, 2]] - self.false_pos = [ - [0.7, 0.0], - [0.3, 1.0], - [0.2, 0.0], - [0.8, 1.0], - [0.1, 0.0], - ] - - -class TestDetectionMAPOp11PointWithClassNoTP(TestDetectionMAPOp): - def init_test_case(self): - self.overlap_threshold = 0.3 - self.evaluate_difficult = True - self.ap_type = "11point" - - self.label_lod = [[2]] - # label difficult xmin ymin xmax ymax - self.label = [[2, 0, 0.3, 0.3, 0.6, 0.5], [1, 0, 0.7, 0.1, 0.9, 0.3]] - - # label score xmin ymin xmax ymax difficult - self.detect_lod = [[1]] - self.detect = [[1, 0.2, 0.8, 0.1, 1.0, 0.3]] - - # label score true_pos false_pos - self.tf_pos_lod = [[3, 4]] - self.tf_pos = [[1, 0.2, 1, 0]] - - self.class_pos_count = [] - self.true_pos_lod = [[]] - self.true_pos = [[]] - self.false_pos_lod = [[]] - self.false_pos = [[]] - - -if __name__ == '__main__': - unittest.main() diff --git a/test/legacy_test/test_generate_mask_labels_op.py b/test/legacy_test/test_generate_mask_labels_op.py deleted file mode 100644 index 86ab3cb088879..0000000000000 --- a/test/legacy_test/test_generate_mask_labels_op.py +++ /dev/null @@ -1,317 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
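Editor's note: the deleted calc_map above reconstructs per-class AP from
(score, tp/fp) pairs. As a compact cross-check, here is a self-contained
sketch of its "integral" mode; integral_ap and its argument names are
editorial inventions, not Paddle APIs:

    import numpy as np

    def integral_ap(scored_hits, num_gt):
        # scored_hits: (score, is_true_positive) pairs for one class, with
        # is_true_positive in {0.0, 1.0}; num_gt: ground-truth count for it.
        scored_hits = sorted(scored_hits, key=lambda p: p[0], reverse=True)
        tp = np.cumsum([hit for _, hit in scored_hits])
        fp = np.cumsum([1.0 - hit for _, hit in scored_hits])
        precision = tp / (tp + fp)
        recall = tp / float(num_gt)
        ap, prev_recall = 0.0, 0.0
        for p, r in zip(precision, recall):
            if abs(r - prev_recall) > 1e-6:  # same tolerance as the test above
                ap += p * abs(r - prev_recall)
                prev_recall = r
        return ap

    # e.g. integral_ap([(0.9, 1.0), (0.8, 0.0), (0.7, 1.0)], num_gt=2)
    # gives 1.0 * 0.5 + (2/3) * 0.5 = 0.8333...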
- -import math -import unittest - -import numpy as np - -''' -# Equivalent code -rles = mask_util.frPyObjects([segm], im_h, im_w) -mask = mask_util.decode(rles) -''' - - -def decode(cnts, m): - v = 0 - mask = [] - for j in range(m): - for k in range(cnts[j]): - mask.append(v) - v = 1 - v - return mask - - -def poly2mask(xy, k, h, w): - scale = 5.0 - x = [int(scale * p + 0.5) for p in xy[::2]] - x = x + [x[0]] - y = [int(scale * p + 0.5) for p in xy[1::2]] - y = y + [y[0]] - m = sum( - [ - int(max(abs(x[j] - x[j + 1]), abs(y[j] - y[j + 1]))) + 1 - for j in range(k) - ] - ) - - u, v = [], [] - for j in range(k): - xs = x[j] - xe = x[j + 1] - ys = y[j] - ye = y[j + 1] - dx = abs(xe - xs) - dy = abs(ys - ye) - flip = (dx >= dy and xs > xe) or (dx < dy and ys > ye) - if flip: - xs, xe = xe, xs - ys, ye = ye, ys - - if dx >= dy: - if dx == 0: - assert ye - ys == 0 - s = 0 if dx == 0 else float(ye - ys) / dx - else: - if dy == 0: - assert xe - xs == 0 - s = 0 if dy == 0 else float(xe - xs) / dy - - if dx >= dy: - ts = [dx - d if flip else d for d in range(dx + 1)] - u.extend([xs + t for t in ts]) - v.extend([int(ys + s * t + 0.5) for t in ts]) - else: - ts = [dy - d if flip else d for d in range(dy + 1)] - v.extend([t + ys for t in ts]) - u.extend([int(xs + s * t + 0.5) for t in ts]) - - k = len(u) - x = np.zeros((k), np.int_) - y = np.zeros((k), np.int_) - m = 0 - for j in range(1, k): - if u[j] != u[j - 1]: - xd = float(u[j] if (u[j] < u[j - 1]) else (u[j] - 1)) - xd = (xd + 0.5) / scale - 0.5 - if math.floor(xd) != xd or xd < 0 or xd > (w - 1): - continue - yd = float(v[j] if v[j] < v[j - 1] else v[j - 1]) - yd = (yd + 0.5) / scale - 0.5 - yd = math.ceil(0 if yd < 0 else (h if yd > h else yd)) - x[m] = int(xd) - y[m] = int(yd) - m += 1 - k = m - a = [int(x[i] * h + y[i]) for i in range(k)] - a.append(h * w) - a.sort() - b = [0] + a[: len(a) - 1] - a = [c - d for (c, d) in zip(a, b)] - - k += 1 - b = [0 for i in range(k)] - b[0] = a[0] - m, j = 1, 1 - while j < k: - if a[j] > 0: - b[m] = a[j] - m += 1 - j += 1 - else: - j += 1 - if j < k: - b[m - 1] += a[j] - j += 1 - mask = decode(b, m) - mask = np.array(mask, dtype=np.int_).reshape((w, h)) - mask = mask.transpose((1, 0)) - return mask - - -def polys_to_boxes(polys): - """Convert a list of polygons into an array of tight bounding boxes.""" - boxes_from_polys = np.zeros((len(polys), 4), dtype=np.float32) - for i in range(len(polys)): - poly = polys[i] - x0 = min(min(p[::2]) for p in poly) - x1 = max(max(p[::2]) for p in poly) - y0 = min(min(p[1::2]) for p in poly) - y1 = max(max(p[1::2]) for p in poly) - boxes_from_polys[i, :] = [x0, y0, x1, y1] - return boxes_from_polys - - -def bbox_overlaps(boxes, query_boxes): - N = boxes.shape[0] - K = query_boxes.shape[0] - overlaps = np.zeros((N, K), dtype=boxes.dtype) - for k in range(K): - box_area = (query_boxes[k, 2] - query_boxes[k, 0] + 1) * ( - query_boxes[k, 3] - query_boxes[k, 1] + 1 - ) - for n in range(N): - iw = ( - min(boxes[n, 2], query_boxes[k, 2]) - - max(boxes[n, 0], query_boxes[k, 0]) - + 1 - ) - if iw > 0: - ih = ( - min(boxes[n, 3], query_boxes[k, 3]) - - max(boxes[n, 1], query_boxes[k, 1]) - + 1 - ) - if ih > 0: - ua = float( - (boxes[n, 2] - boxes[n, 0] + 1) - * (boxes[n, 3] - boxes[n, 1] + 1) - + box_area - - iw * ih - ) - overlaps[n, k] = iw * ih / ua - return overlaps - - -def polys_to_mask_wrt_box(polygons, box, M): - """Convert from the COCO polygon segmentation format to a binary mask - encoded as a 2D array of data type numpy.float32. 
The polygon segmentation - is understood to be enclosed in the given box and rasterized to an M x M - mask. The resulting mask is therefore of shape (M, M). - """ - w = box[2] - box[0] - h = box[3] - box[1] - - w = np.maximum(w, 1) - h = np.maximum(h, 1) - - polygons_norm = [] - for poly in polygons: - p = np.array(poly, dtype=np.float32) - p[0::2] = (p[0::2] - box[0]) * M / w - p[1::2] = (p[1::2] - box[1]) * M / h - polygons_norm.append(p) - - mask = [] - for polygons in polygons_norm: - assert polygons.shape[0] % 2 == 0 - k = polygons.shape[0] // 2 - mask.append(poly2mask(polygons, k, M, M)) - mask = np.array(mask) - # Flatten in case polygons was a list - mask = np.sum(mask, axis=0) - mask = np.array(mask > 0, dtype=np.float32) - return mask - - -def expand_mask_targets(masks, mask_class_labels, resolution, num_classes): - """Expand masks from shape (#masks, resolution ** 2) - to (#masks, #classes * resolution ** 2) to encode class - specific mask targets. - """ - assert masks.shape[0] == mask_class_labels.shape[0] - - # Target values of -1 are "don't care" / ignore labels - mask_targets = -np.ones( - (masks.shape[0], num_classes * resolution**2), dtype=np.int32 - ) - for i in range(masks.shape[0]): - cls = int(mask_class_labels[i]) - start = resolution**2 * cls - end = start + resolution**2 - # Ignore background instance - # (only happens when there is no fg samples in an image) - if cls > 0: - mask_targets[i, start:end] = masks[i, :] - return mask_targets - - -def generate_mask_labels( - num_classes, - im_info, - gt_classes, - is_crowd, - label_int32, - gt_polys, - resolution, - rois, - roi_lod, - gt_lod, -): - mask_rois = [] - roi_has_mask_int32 = [] - mask_int32 = [] - new_lod = [] - for i in range(len(im_info)): - roi_s = roi_lod[i] - roi_e = roi_lod[i + 1] - gt_s = gt_lod[i] - gt_e = gt_lod[i + 1] - mask_blob = _sample_mask( - num_classes, - im_info[i], - gt_classes[gt_s:gt_e], - is_crowd[gt_s:gt_e], - label_int32[roi_s:roi_e], - gt_polys[i], - resolution, - rois[roi_s:roi_e], - ) - new_lod.append(mask_blob['mask_rois'].shape[0]) - mask_rois.append(mask_blob['mask_rois']) - roi_has_mask_int32.append(mask_blob['roi_has_mask_int32']) - mask_int32.append(mask_blob['mask_int32']) - return mask_rois, roi_has_mask_int32, mask_int32, new_lod - - -def _sample_mask( - num_classes, - im_info, - gt_classes, - is_crowd, - label_int32, - gt_polys, # [[[], []], []] - resolution, - rois, -): - mask_blob = {} - im_scale = im_info[2] - sample_boxes = rois - polys_gt_inds = np.where((gt_classes > 0) & (is_crowd == 0))[0] - polys_gt = [gt_polys[i] for i in polys_gt_inds] - boxes_from_polys = polys_to_boxes(polys_gt) - - fg_inds = np.where(label_int32 > 0)[0] - roi_has_mask = fg_inds.copy() - if fg_inds.shape[0] > 0: - mask_class_labels = label_int32[fg_inds] - masks = np.zeros((fg_inds.shape[0], resolution**2), dtype=np.int32) - rois_fg = sample_boxes[fg_inds] - overlaps_bbfg_bbpolys = bbox_overlaps( - rois_fg.astype(np.float32), boxes_from_polys.astype(np.float32) - ) - fg_polys_inds = np.argmax(overlaps_bbfg_bbpolys, axis=1) - for i in range(rois_fg.shape[0]): - fg_polys_ind = fg_polys_inds[i] - poly_gt = polys_gt[fg_polys_ind] - roi_fg = rois_fg[i] - mask = polys_to_mask_wrt_box(poly_gt, roi_fg, resolution) - mask = np.array(mask > 0, dtype=np.int32) - masks[i, :] = np.reshape(mask, resolution**2) - else: - bg_inds = np.where(label_int32 == 0)[0] - rois_fg = sample_boxes[bg_inds[0]].reshape((1, -1)) - masks = -np.ones((1, resolution**2), dtype=np.int32) - mask_class_labels = np.zeros((1,)) - 
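# (Editor's note, not part of the deleted file: this else branch is the
# no-foreground fallback -- a single background RoI is kept and its mask
# targets are all -1, which expand_mask_targets above documents as the
# "don't care" / ignore label, so downstream shapes stay valid without the
# sample contributing to the loss.)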
roi_has_mask = np.append(roi_has_mask, 0) - masks = expand_mask_targets( - masks, mask_class_labels, resolution, num_classes - ) - rois_fg *= im_scale - mask_blob['mask_rois'] = rois_fg - mask_blob['roi_has_mask_int32'] = roi_has_mask - mask_blob['mask_int32'] = masks - return mask_blob - - -def trans_lod(lod): - new_lod = [0] - for i in range(len(lod)): - new_lod.append(lod[i] + new_lod[i]) - return new_lod - - -if __name__ == '__main__': - unittest.main() diff --git a/test/legacy_test/test_generate_proposal_labels_op.py b/test/legacy_test/test_generate_proposal_labels_op.py deleted file mode 100644 index 903201b9856a7..0000000000000 --- a/test/legacy_test/test_generate_proposal_labels_op.py +++ /dev/null @@ -1,553 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -from op_test import OpTest - - -def generate_proposal_labels_in_python( - rpn_rois, - gt_classes, - is_crowd, - gt_boxes, - im_info, - batch_size_per_im, - fg_fraction, - fg_thresh, - bg_thresh_hi, - bg_thresh_lo, - bbox_reg_weights, - class_nums, - use_random, - is_cls_agnostic, - is_cascade_rcnn, - max_overlaps=None, -): - rois = [] - labels_int32 = [] - bbox_targets = [] - bbox_inside_weights = [] - bbox_outside_weights = [] - max_overlap_with_gt = [] - lod = [] - assert len(rpn_rois) == len( - im_info - ), 'batch size of rpn_rois and ground_truth is not matched' - - for im_i in range(len(im_info)): - max_overlap = max_overlaps[im_i] if is_cascade_rcnn else None - frcn_blobs = _sample_rois( - rpn_rois[im_i], - gt_classes[im_i], - is_crowd[im_i], - gt_boxes[im_i], - im_info[im_i], - batch_size_per_im, - fg_fraction, - fg_thresh, - bg_thresh_hi, - bg_thresh_lo, - bbox_reg_weights, - class_nums, - use_random, - is_cls_agnostic, - is_cascade_rcnn, - max_overlap, - ) - lod.append(frcn_blobs['rois'].shape[0]) - rois.append(frcn_blobs['rois']) - labels_int32.append(frcn_blobs['labels_int32']) - bbox_targets.append(frcn_blobs['bbox_targets']) - bbox_inside_weights.append(frcn_blobs['bbox_inside_weights']) - bbox_outside_weights.append(frcn_blobs['bbox_outside_weights']) - max_overlap_with_gt.append(frcn_blobs['max_overlap']) - - return ( - rois, - labels_int32, - bbox_targets, - bbox_inside_weights, - bbox_outside_weights, - max_overlap_with_gt, - lod, - ) - - -def filter_roi(rois, max_overlap): - ws = rois[:, 2] - rois[:, 0] + 1 - hs = rois[:, 3] - rois[:, 1] + 1 - keep = np.where((ws > 0) & (hs > 0) & (max_overlap < 1.0))[0] - if len(keep) > 0: - return rois[keep, :] - return np.zeros((1, 4)).astype('float32') - - -def _sample_rois( - rpn_rois, - gt_classes, - is_crowd, - gt_boxes, - im_info, - batch_size_per_im, - fg_fraction, - fg_thresh, - bg_thresh_hi, - bg_thresh_lo, - bbox_reg_weights, - class_nums, - use_random, - is_cls_agnostic, - is_cascade_rcnn, - max_overlap, -): - rois_per_image = int(batch_size_per_im) - fg_rois_per_im = int(np.round(fg_fraction * rois_per_image)) - - # Roidb - im_scale = im_info[2] - 
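# (Editor's note, not part of the deleted file: im_info is laid out as
# [height, width, scale] -- see init_test_input below, which sets the third
# entry to 0.8. RPN proposals arrive in scaled-image coordinates, so they
# are mapped back by 1/scale here to match gt_boxes, sampled and matched in
# the unscaled frame, and multiplied by im_scale again just before being
# packed into frcn_blobs.)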
inv_im_scale = 1.0 / im_scale - rpn_rois = rpn_rois * inv_im_scale - - if is_cascade_rcnn: - rpn_rois = filter_roi(rpn_rois, max_overlap) - - boxes = np.vstack([gt_boxes, rpn_rois]) - - gt_overlaps = np.zeros((boxes.shape[0], class_nums)) - box_to_gt_ind_map = np.zeros((boxes.shape[0]), dtype=np.int32) - proposal_to_gt_overlaps = _bbox_overlaps(boxes, gt_boxes) - - overlaps_argmax = proposal_to_gt_overlaps.argmax(axis=1) - overlaps_max = proposal_to_gt_overlaps.max(axis=1) - # Boxes which with non-zero overlap with gt boxes - overlapped_boxes_ind = np.where(overlaps_max > 0)[0] - overlapped_boxes_gt_classes = gt_classes[ - overlaps_argmax[overlapped_boxes_ind] - ] - gt_overlaps[ - overlapped_boxes_ind, overlapped_boxes_gt_classes - ] = overlaps_max[overlapped_boxes_ind] - box_to_gt_ind_map[overlapped_boxes_ind] = overlaps_argmax[ - overlapped_boxes_ind - ] - - crowd_ind = np.where(is_crowd)[0] - gt_overlaps[crowd_ind] = -1.0 - max_overlaps = gt_overlaps.max(axis=1) - max_classes = gt_overlaps.argmax(axis=1) - - if is_cascade_rcnn: - # Cascade RCNN Decode Filter - fg_inds = np.where(max_overlaps >= fg_thresh)[0] - bg_inds = np.where( - (max_overlaps < bg_thresh_hi) & (max_overlaps >= bg_thresh_lo) - )[0] - fg_rois_per_this_image = fg_inds.shape[0] - bg_rois_per_this_image = bg_inds.shape[0] - else: - # Foreground - fg_inds = np.where(max_overlaps >= fg_thresh)[0] - fg_rois_per_this_image = np.minimum(fg_rois_per_im, fg_inds.shape[0]) - # Sample foreground if there are too many - if (fg_inds.shape[0] > fg_rois_per_this_image) and use_random: - fg_inds = np.random.choice( - fg_inds, size=fg_rois_per_this_image, replace=False - ) - fg_inds = fg_inds[:fg_rois_per_this_image] - # Background - bg_inds = np.where( - (max_overlaps < bg_thresh_hi) & (max_overlaps >= bg_thresh_lo) - )[0] - bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image - bg_rois_per_this_image = np.minimum( - bg_rois_per_this_image, bg_inds.shape[0] - ) - # Sample background if there are too many - if (bg_inds.shape[0] > bg_rois_per_this_image) and use_random: - bg_inds = np.random.choice( - bg_inds, size=bg_rois_per_this_image, replace=False - ) - bg_inds = bg_inds[:bg_rois_per_this_image] - - keep_inds = np.append(fg_inds, bg_inds) - sampled_labels = max_classes[keep_inds] - sampled_labels[fg_rois_per_this_image:] = 0 - sampled_boxes = boxes[keep_inds] - sampled_max_overlap = max_overlaps[keep_inds] - sampled_gts = gt_boxes[box_to_gt_ind_map[keep_inds]] - sampled_gts[fg_rois_per_this_image:, :] = gt_boxes[0] - bbox_label_targets = _compute_targets( - sampled_boxes, sampled_gts, sampled_labels, bbox_reg_weights - ) - bbox_targets, bbox_inside_weights = _expand_bbox_targets( - bbox_label_targets, class_nums, is_cls_agnostic - ) - bbox_outside_weights = np.array( - bbox_inside_weights > 0, dtype=bbox_inside_weights.dtype - ) - # Scale rois - sampled_rois = sampled_boxes * im_scale - - # Faster RCNN blobs - frcn_blobs = { - 'rois': sampled_rois, - 'labels_int32': sampled_labels, - 'bbox_targets': bbox_targets, - 'bbox_inside_weights': bbox_inside_weights, - 'bbox_outside_weights': bbox_outside_weights, - 'max_overlap': sampled_max_overlap, - } - return frcn_blobs - - -def _bbox_overlaps(roi_boxes, gt_boxes): - w1 = np.maximum(roi_boxes[:, 2] - roi_boxes[:, 0] + 1, 0) - h1 = np.maximum(roi_boxes[:, 3] - roi_boxes[:, 1] + 1, 0) - w2 = np.maximum(gt_boxes[:, 2] - gt_boxes[:, 0] + 1, 0) - h2 = np.maximum(gt_boxes[:, 3] - gt_boxes[:, 1] + 1, 0) - area1 = w1 * h1 - area2 = w2 * h2 - - overlaps = 
np.zeros((roi_boxes.shape[0], gt_boxes.shape[0])) - for ind1 in range(roi_boxes.shape[0]): - for ind2 in range(gt_boxes.shape[0]): - inter_x1 = np.maximum(roi_boxes[ind1, 0], gt_boxes[ind2, 0]) - inter_y1 = np.maximum(roi_boxes[ind1, 1], gt_boxes[ind2, 1]) - inter_x2 = np.minimum(roi_boxes[ind1, 2], gt_boxes[ind2, 2]) - inter_y2 = np.minimum(roi_boxes[ind1, 3], gt_boxes[ind2, 3]) - inter_w = np.maximum(inter_x2 - inter_x1 + 1, 0) - inter_h = np.maximum(inter_y2 - inter_y1 + 1, 0) - inter_area = inter_w * inter_h - iou = inter_area / (area1[ind1] + area2[ind2] - inter_area) - overlaps[ind1, ind2] = iou - return overlaps - - -def _compute_targets(roi_boxes, gt_boxes, labels, bbox_reg_weights): - assert roi_boxes.shape[0] == gt_boxes.shape[0] - assert roi_boxes.shape[1] == 4 - assert gt_boxes.shape[1] == 4 - - targets = np.zeros(roi_boxes.shape) - bbox_reg_weights = np.asarray(bbox_reg_weights) - targets = _box_to_delta( - ex_boxes=roi_boxes, gt_boxes=gt_boxes, weights=bbox_reg_weights - ) - - return np.hstack([labels[:, np.newaxis], targets]).astype( - np.float32, copy=False - ) - - -def _box_to_delta(ex_boxes, gt_boxes, weights): - ex_w = ex_boxes[:, 2] - ex_boxes[:, 0] + 1 - ex_h = ex_boxes[:, 3] - ex_boxes[:, 1] + 1 - ex_ctr_x = ex_boxes[:, 0] + 0.5 * ex_w - ex_ctr_y = ex_boxes[:, 1] + 0.5 * ex_h - - gt_w = gt_boxes[:, 2] - gt_boxes[:, 0] + 1 - gt_h = gt_boxes[:, 3] - gt_boxes[:, 1] + 1 - gt_ctr_x = gt_boxes[:, 0] + 0.5 * gt_w - gt_ctr_y = gt_boxes[:, 1] + 0.5 * gt_h - - dx = (gt_ctr_x - ex_ctr_x) / ex_w / weights[0] - dy = (gt_ctr_y - ex_ctr_y) / ex_h / weights[1] - dw = (np.log(gt_w / ex_w)) / weights[2] - dh = (np.log(gt_h / ex_h)) / weights[3] - - targets = np.vstack([dx, dy, dw, dh]).transpose() - return targets - - -def _expand_bbox_targets(bbox_targets_input, class_nums, is_cls_agnostic): - class_labels = bbox_targets_input[:, 0] - fg_inds = np.where(class_labels > 0)[0] - # if is_cls_agnostic: - # class_labels = [1 if ll > 0 else 0 for ll in class_labels] - # class_labels = np.array(class_labels, dtype=np.int32) - # class_nums = 2 - bbox_targets = np.zeros( - ( - class_labels.shape[0], - 4 * class_nums if not is_cls_agnostic else 4 * 2, - ) - ) - bbox_inside_weights = np.zeros(bbox_targets.shape) - for ind in fg_inds: - class_label = int(class_labels[ind]) if not is_cls_agnostic else 1 - start_ind = class_label * 4 - end_ind = class_label * 4 + 4 - bbox_targets[ind, start_ind:end_ind] = bbox_targets_input[ind, 1:] - bbox_inside_weights[ind, start_ind:end_ind] = (1.0, 1.0, 1.0, 1.0) - return bbox_targets, bbox_inside_weights - - -class TestGenerateProposalLabelsOp(OpTest): - def set_data(self): - # self.use_random = False - self.init_use_random() - self.init_test_params() - self.init_test_input() - self.init_test_cascade() - self.init_test_output() - - self.inputs = { - 'RpnRois': (self.rpn_rois[0], self.rpn_rois_lod), - 'GtClasses': (self.gt_classes[0], self.gts_lod), - 'IsCrowd': (self.is_crowd[0], self.gts_lod), - 'GtBoxes': (self.gt_boxes[0], self.gts_lod), - 'ImInfo': self.im_info, - } - if self.max_overlaps is not None: - self.inputs['MaxOverlap'] = ( - self.max_overlaps[0], - self.rpn_rois_lod, - ) - - self.attrs = { - 'batch_size_per_im': self.batch_size_per_im, - 'fg_fraction': self.fg_fraction, - 'fg_thresh': self.fg_thresh, - 'bg_thresh_hi': self.bg_thresh_hi, - 'bg_thresh_lo': self.bg_thresh_lo, - 'bbox_reg_weights': self.bbox_reg_weights, - 'class_nums': self.class_nums, - 'use_random': self.use_random, - 'is_cls_agnostic': self.is_cls_agnostic, - 'is_cascade_rcnn': 
self.is_cascade_rcnn, - } - self.outputs = { - 'Rois': (self.rois, [self.lod]), - 'LabelsInt32': (self.labels_int32, [self.lod]), - 'BboxTargets': (self.bbox_targets, [self.lod]), - 'BboxInsideWeights': (self.bbox_inside_weights, [self.lod]), - 'BboxOutsideWeights': (self.bbox_outside_weights, [self.lod]), - 'MaxOverlapWithGT': (self.max_overlap_with_gt, [self.lod]), - } - - def test_check_output(self): - # NODE(yjjiang11): This op will be deprecated. - self.check_output(check_dygraph=False) - - def setUp(self): - self.op_type = 'generate_proposal_labels' - self.set_data() - - def init_test_cascade( - self, - ): - self.is_cascade_rcnn = False - self.max_overlaps = None - - def init_use_random(self): - self.use_random = False - - def init_test_params(self): - self.batch_size_per_im = 100 - self.fg_fraction = 0.25 - self.fg_thresh = 0.5 - self.bg_thresh_hi = 0.5 - self.bg_thresh_lo = 0.0 - self.bbox_reg_weights = [0.1, 0.1, 0.2, 0.2] - self.is_cls_agnostic = False - self.class_nums = 2 if self.is_cls_agnostic else 81 - - def init_test_input(self): - np.random.seed(0) - gt_nums = 6 # Keep same with batch_size_per_im for unittest - proposal_nums = 200 - images_shape = [[64, 64]] - self.im_info = np.ones((len(images_shape), 3)).astype(np.float32) - for i in range(len(images_shape)): - self.im_info[i, 0] = images_shape[i][0] - self.im_info[i, 1] = images_shape[i][1] - self.im_info[i, 2] = 0.8 # scale - - self.rpn_rois, self.rpn_rois_lod = _generate_proposals( - images_shape, proposal_nums - ) - ground_truth, self.gts_lod = _generate_groundtruth( - images_shape, self.class_nums, gt_nums - ) - - self.gt_classes = [gt['gt_classes'] for gt in ground_truth] - self.gt_boxes = [gt['boxes'] for gt in ground_truth] - self.is_crowd = [gt['is_crowd'] for gt in ground_truth] - - def init_test_output(self): - ( - self.rois, - self.labels_int32, - self.bbox_targets, - self.bbox_inside_weights, - self.bbox_outside_weights, - self.max_overlap_with_gt, - self.lod, - ) = generate_proposal_labels_in_python( - self.rpn_rois, - self.gt_classes, - self.is_crowd, - self.gt_boxes, - self.im_info, - self.batch_size_per_im, - self.fg_fraction, - self.fg_thresh, - self.bg_thresh_hi, - self.bg_thresh_lo, - self.bbox_reg_weights, - self.class_nums, - self.use_random, - self.is_cls_agnostic, - self.is_cascade_rcnn, - self.max_overlaps, - ) - self.rois = np.vstack(self.rois) - self.labels_int32 = np.hstack(self.labels_int32) - self.labels_int32 = self.labels_int32[:, np.newaxis] - self.bbox_targets = np.vstack(self.bbox_targets) - self.bbox_inside_weights = np.vstack(self.bbox_inside_weights) - self.bbox_outside_weights = np.vstack(self.bbox_outside_weights) - self.max_overlap_with_gt = np.concatenate(self.max_overlap_with_gt) - - -class TestCascade(TestGenerateProposalLabelsOp): - def init_test_cascade(self): - self.is_cascade_rcnn = True - roi_num = len(self.rpn_rois[0]) - self.max_overlaps = [] - max_overlap = np.random.rand(roi_num).astype('float32') - # Make GT samples with overlap = 1 - max_overlap[max_overlap > 0.9] = 1.0 - self.max_overlaps.append(max_overlap) - - -class TestUseRandom(TestGenerateProposalLabelsOp): - def init_use_random(self): - self.use_random = True - self.is_cascade_rcnn = False - - def test_check_output(self): - self.check_output_customized(self.verify_out) - - def verify_out(self, outs): - print("skip") - - def init_test_params(self): - self.batch_size_per_im = 512 - self.fg_fraction = 0.025 - self.fg_thresh = 0.5 - self.bg_thresh_hi = 0.5 - self.bg_thresh_lo = 0.0 - self.bbox_reg_weights = [0.1, 
0.1, 0.2, 0.2] - self.is_cls_agnostic = False - self.class_nums = 2 if self.is_cls_agnostic else 81 - - -class TestClsAgnostic(TestCascade): - def init_test_params(self): - self.batch_size_per_im = 512 - self.fg_fraction = 0.25 - self.fg_thresh = 0.5 - self.bg_thresh_hi = 0.5 - self.bg_thresh_lo = 0.0 - self.bbox_reg_weights = [0.1, 0.1, 0.2, 0.2] - self.is_cls_agnostic = True - self.class_nums = 2 if self.is_cls_agnostic else 81 - - -class TestOnlyGT(TestCascade): - def init_test_input(self): - np.random.seed(0) - gt_nums = 6 # Keep same with batch_size_per_im for unittest - proposal_nums = 6 - images_shape = [[64, 64]] - self.im_info = np.ones((len(images_shape), 3)).astype(np.float32) - for i in range(len(images_shape)): - self.im_info[i, 0] = images_shape[i][0] - self.im_info[i, 1] = images_shape[i][1] - self.im_info[i, 2] = 0.8 # scale - - ground_truth, self.gts_lod = _generate_groundtruth( - images_shape, self.class_nums, gt_nums - ) - - self.gt_classes = [gt['gt_classes'] for gt in ground_truth] - self.gt_boxes = [gt['boxes'] for gt in ground_truth] - self.is_crowd = [gt['is_crowd'] for gt in ground_truth] - self.rpn_rois = self.gt_boxes - self.rpn_rois_lod = self.gts_lod - - -class TestOnlyGT2(TestCascade): - def init_test_cascade(self): - self.is_cascade_rcnn = True - roi_num = len(self.rpn_rois[0]) - self.max_overlaps = [] - max_overlap = np.ones(roi_num).astype('float32') - self.max_overlaps.append(max_overlap) - - -def _generate_proposals(images_shape, proposal_nums): - rpn_rois = [] - rpn_rois_lod = [] - num_proposals = 0 - for i, image_shape in enumerate(images_shape): - proposals = _generate_boxes(image_shape, proposal_nums) - rpn_rois.append(proposals) - num_proposals = len(proposals) - rpn_rois_lod.append(num_proposals) - return rpn_rois, [rpn_rois_lod] - - -def _generate_groundtruth(images_shape, class_nums, gt_nums): - ground_truth = [] - gts_lod = [] - num_gts = 0 - for i, image_shape in enumerate(images_shape): - # Avoid background - gt_classes = np.random.randint( - low=1, high=class_nums, size=gt_nums - ).astype(np.int32) - gt_boxes = _generate_boxes(image_shape, gt_nums) - is_crowd = np.zeros((gt_nums), dtype=np.int32) - is_crowd[0] = 1 - ground_truth.append( - {'gt_classes': gt_classes, 'boxes': gt_boxes, 'is_crowd': is_crowd} - ) - num_gts += len(gt_classes) - gts_lod.append(num_gts) - return ground_truth, [gts_lod] - - -def _generate_boxes(image_size, box_nums): - width = image_size[0] - height = image_size[1] - xywh = np.random.rand(box_nums, 4) - xy1 = xywh[:, [0, 1]] * image_size - wh = xywh[:, [2, 3]] * (image_size - xy1) - xy2 = xy1 + wh - boxes = np.hstack([xy1, xy2]) - boxes[:, [0, 2]] = np.minimum( - width - 1.0, np.maximum(0.0, boxes[:, [0, 2]]) - ) - boxes[:, [1, 3]] = np.minimum( - height - 1.0, np.maximum(0.0, boxes[:, [1, 3]]) - ) - return boxes.astype(np.float32) - - -if __name__ == '__main__': - unittest.main() diff --git a/test/legacy_test/test_layers.py b/test/legacy_test/test_layers.py index b2e3691eac705..9d60992c186d9 100644 --- a/test/legacy_test/test_layers.py +++ b/test/legacy_test/test_layers.py @@ -30,7 +30,6 @@ batch_fc, partial_concat, partial_sum, - rank_attention, shuffle_batch, ) from paddle.pir_utils import test_with_pir_api @@ -2266,27 +2265,6 @@ def test_batch_fc(self): ) return out - def test_rank_attention(self): - with self.static_graph(): - input = paddle.static.data( - name="input", shape=[None, 2], dtype="float32" - ) - rank_offset = paddle.static.data( - name="rank_offset", shape=[None, 7], dtype="int32" - ) - out = 
rank_attention( - input=input, - rank_offset=rank_offset, - rank_param_shape=[18, 3], - rank_param_attr=base.ParamAttr( - learning_rate=1.0, - name="ubm_rank_param.w_0", - initializer=paddle.nn.initializer.XavierNormal(), - ), - max_rank=3, - ) - return out - def test_row_conv(self): # TODO(minqiyang): dygraph do not support lod now with self.static_graph(): diff --git a/test/legacy_test/test_linear_chain_crf_op.py b/test/legacy_test/test_linear_chain_crf_op.py deleted file mode 100755 index 6899a34063378..0000000000000 --- a/test/legacy_test/test_linear_chain_crf_op.py +++ /dev/null @@ -1,266 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import random -import unittest - -import numpy as np -from op_test import OpTest - - -class LinearChainCrfForward: - def __init__( - self, - seq_start_positions, - emission_weights, - emission_row_max, - emission_exps, - transition_weights, - transition_exps, - labels, - ): - self.tag_num = emission_weights.shape[1] - self.seq_num = len(seq_start_positions) - 1 - - self.seq_start_positions = seq_start_positions - self.labels = labels - self.x = emission_weights - - self.x_row_max = emission_row_max - self.x_exps = emission_exps - - # unnormalized logits of the transition weights for the start mark. - self.a = transition_weights[0, :] - self.a_exps = transition_exps[0, :] - # unnormalized logits of the transition weights for the end mark. - self.b = transition_weights[1, :] - self.b_exps = transition_exps[1, :] - # unnormalized logits of the transition weights for all the other tags. - self.w = transition_weights[2:, :] - self.w_exps = transition_exps[2:, :] - - # The output of linear chain crf operator. - # alpha is a memo table in dynamic programming to calculate - # nomalization factor. - self.alpha = np.zeros( - (seq_start_positions[-1], self.tag_num), dtype="float64" - ) - self.log_likelihood = np.zeros((self.seq_num, 1)) - - def _l1_norm(self, x): - s = np.sum(x) - x /= s - return s - - def _forward_a_sequence(self, x, x_row_max, x_exps, label, alpha): - seq_len = x_row_max.shape[0] - log_likelihood = 0.0 - - for i in range(self.tag_num): - alpha[0, i] = self.a_exps[i] * x_exps[0, i] - log_likelihood = -x_row_max[0] - np.log(self._l1_norm(alpha[0, :])) - - # calculate the unnormalized logits of the normalization factor. - for k in range(1, seq_len): - for i in range(self.tag_num): - s = 0.0 - for j in range(self.tag_num): - s += alpha[k - 1, j] * self.w_exps[j, i] - alpha[k, i] = x_exps[k, i] * s - log_likelihood -= x_row_max[k] + np.log(self._l1_norm(alpha[k, :])) - s = 0.0 - for i in range(self.tag_num): - s += alpha[-1, i] * self.b_exps[i] - log_likelihood -= np.log(s) - - # calculate the nominator part. 
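# (Editor's note, not part of the deleted file: "nominator" here means the
# numerator, i.e. the unnormalized score of the gold path
#     s(y) = a[y_0] + sum_k x[k, y_k] + sum_{k>0} w[y_{k-1}, y_k] + b[y_T].
# At this point log_likelihood holds -log Z from the alpha recursion above,
# so after adding s(y) it equals log p(y|x), and the function returns the
# negative log-likelihood log Z - s(y).)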
- log_likelihood += self.a[label[0]] + x[0, label[0]] + self.b[label[-1]] - - for k in range(1, seq_len): - log_likelihood += x[k, label[k]] + self.w[label[k - 1], label[k]] - return -log_likelihood - - def crf_forward_compute(self): - for i in range(self.seq_num): - start = self.seq_start_positions[i] - end = self.seq_start_positions[i + 1] - if start >= end: - continue - self.log_likelihood[i] = self._forward_a_sequence( - self.x[start:end, :], - self.x_row_max[start:end, :], - self.x_exps[start:end, :], - self.labels[start:end, :], - self.alpha[start:end, :], - ) - return self.alpha, self.log_likelihood - - -class TestLinearChainCrfOp(OpTest): - def set_test_data(self): - # TODO(caoying) Fix the unittest by: add the boundary cases when - # sequence lengths are 1, 2, and 3. - - SEQ_NUM = 3 - TAG_NUM = 17 - MAX_SEQ_LEN = 5 - - # the linear_chain_crf operator only supports sequence (LoD level = 1) - lod = [[]] - seq_start_pos = [0] - for i in range(SEQ_NUM): - lod[-1].append(random.randint(1, MAX_SEQ_LEN)) - seq_start_pos.append(seq_start_pos[-1] + lod[-1][-1]) - emission = np.random.uniform( - -1, 1, [seq_start_pos[-1], TAG_NUM] - ).astype("float64") - emission_row_max = np.amax(emission, axis=1, keepdims=True) - emission_exps = np.exp(emission - emission_row_max) - - transition = np.random.uniform( - -0.5, 0.5, [TAG_NUM + 2, TAG_NUM] - ).astype("float64") - transition_exps = np.exp(transition) - - labels = np.random.randint( - low=0, high=TAG_NUM, size=(seq_start_pos[-1], 1), dtype="int64" - ) - - self.inputs = { - "Emission": (emission, lod), - "Transition": transition, - "Label": (labels, lod), - } - crf = LinearChainCrfForward( - seq_start_pos, - emission, - emission_row_max, - emission_exps, - transition, - transition_exps, - labels, - ) - alpha, log_likelihood = crf.crf_forward_compute() - - self.outputs = { - "Alpha": alpha, - "EmissionExps": emission_exps, - "TransitionExps": transition_exps, - "LogLikelihood": log_likelihood, - } - - def setUp(self): - self.op_type = "linear_chain_crf" - self.set_test_data() - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(["Emission", "Transition"], "LogLikelihood") - - def test_check_grad_ignore_transition(self): - self.check_grad( - ["Emission"], "LogLikelihood", no_grad_set=set("Transition") - ) - - -class TestLinearChainCrfPaddingTensor(OpTest): - def seq_pad(self, data, length): - max_len = np.max(length) - shape = [len(length), max_len] + list(data.shape[1:]) - padded = np.zeros(shape).astype(data.dtype) - offset = 0 - for i, l in enumerate(length): - padded[i, 0:l] = data[offset : offset + l] - offset += l - return padded - - def seq_pad_exps(self, data, length): - # Adding for transition_exps - max_len = np.max(length) - shape = [len(length), max_len] + list(data.shape[1:]) - padded = np.ones(shape).astype(data.dtype) - offset = 0 - for i, l in enumerate(length): - padded[i, 0:l] = data[offset : offset + l] - offset += l - return padded - - def set_test_data_1(self): - # Fix the unittest by: add padding tensor in inputs - SEQ_NUM = 3 - TAG_NUM = 17 - MAX_SEQ_LEN = 5 - - # the linear_chain_crf operator only supports sequence (LoD level = 1) - lod = [[]] - seq_start_pos = [0] - for i in range(SEQ_NUM): - lod[-1].append(random.randint(1, MAX_SEQ_LEN)) - seq_start_pos.append(seq_start_pos[-1] + lod[-1][-1]) - emission = np.random.uniform( - -1, 1, [seq_start_pos[-1], TAG_NUM] - ).astype("float64") - emission_row_max = np.amax(emission, axis=1, keepdims=True) - emission_exps = 
np.exp(emission - emission_row_max) - transition = np.random.uniform( - -0.5, 0.5, [TAG_NUM + 2, TAG_NUM] - ).astype("float64") - transition_exps = np.exp(transition) - - labels = np.random.randint( - low=0, high=TAG_NUM, size=(seq_start_pos[-1], 1), dtype="int64" - ) - self.inputs = { - "Emission": self.seq_pad(emission, lod[0]), - "Transition": transition, - "Label": self.seq_pad(labels, lod[0]), - "Length": np.array(lod).astype("int64"), - } - crf = LinearChainCrfForward( - seq_start_pos, - emission, - emission_row_max, - emission_exps, - transition, - transition_exps, - labels, - ) - alpha, log_likelihood = crf.crf_forward_compute() - self.outputs = { - "Alpha": self.seq_pad(alpha, lod[0]), - "EmissionExps": self.seq_pad_exps(emission_exps, lod[0]), - "TransitionExps": transition_exps, - "LogLikelihood": log_likelihood, - } - - def setUp(self): - self.op_type = "linear_chain_crf" - self.set_test_data_1() - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(["Emission", "Transition"], "LogLikelihood") - - def test_check_grad_ignore_transition(self): - self.check_grad( - ["Emission"], "LogLikelihood", no_grad_set=set("Transition") - ) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/legacy_test/test_minus_op.py b/test/legacy_test/test_minus_op.py deleted file mode 100644 index 26d01a179ff46..0000000000000 --- a/test/legacy_test/test_minus_op.py +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -from op_test import OpTest - -import paddle - - -class TestMinusOp(OpTest): - def setUp(self): - self.op_type = "minus" - self.inputs = { - 'X': np.random.random((32, 84)).astype("float32"), - 'Y': np.random.random((32, 84)).astype("float32"), - } - self.outputs = {'Out': (self.inputs['X'] - self.inputs['Y'])} - - def test_check_output(self): - # NODE(yjjiang11): This op will be deprecated. - self.check_output(check_dygraph=False) - - def test_check_grad(self): - self.check_grad(['X', 'Y'], 'Out', check_dygraph=False) - - -if __name__ == "__main__": - paddle.enable_static() - unittest.main() diff --git a/test/legacy_test/test_precision_recall_op.py b/test/legacy_test/test_precision_recall_op.py deleted file mode 100644 index 97f3d7e7724a4..0000000000000 --- a/test/legacy_test/test_precision_recall_op.py +++ /dev/null @@ -1,206 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -from op_test import OpTest - - -def calc_precision(tp_count, fp_count): - if tp_count > 0.0 or fp_count > 0.0: - return tp_count / (tp_count + fp_count) - return 1.0 - - -def calc_recall(tp_count, fn_count): - if tp_count > 0.0 or fn_count > 0.0: - return tp_count / (tp_count + fn_count) - return 1.0 - - -def calc_f1_score(precision, recall): - if precision > 0.0 or recall > 0.0: - return 2 * precision * recall / (precision + recall) - return 0.0 - - -def get_states(idxs, labels, cls_num, weights=None): - ins_num = idxs.shape[0] - # TP FP TN FN - states = np.zeros((cls_num, 4)).astype('float32') - for i in range(ins_num): - w = weights[i] if weights is not None else 1.0 - idx = idxs[i][0] - label = labels[i][0] - if idx == label: - states[idx][0] += w - for j in range(cls_num): - states[j][2] += w - states[idx][2] -= w - else: - states[label][3] += w - states[idx][1] += w - for j in range(cls_num): - states[j][2] += w - states[label][2] -= w - states[idx][2] -= w - return states - - -def compute_metrics(states, cls_num): - total_tp_count = 0.0 - total_fp_count = 0.0 - total_fn_count = 0.0 - macro_avg_precision = 0.0 - macro_avg_recall = 0.0 - for i in range(cls_num): - total_tp_count += states[i][0] - total_fp_count += states[i][1] - total_fn_count += states[i][3] - macro_avg_precision += calc_precision(states[i][0], states[i][1]) - macro_avg_recall += calc_recall(states[i][0], states[i][3]) - metrics = [] - macro_avg_precision /= cls_num - macro_avg_recall /= cls_num - metrics.append(macro_avg_precision) - metrics.append(macro_avg_recall) - metrics.append(calc_f1_score(macro_avg_precision, macro_avg_recall)) - micro_avg_precision = calc_precision(total_tp_count, total_fp_count) - metrics.append(micro_avg_precision) - micro_avg_recall = calc_recall(total_tp_count, total_fn_count) - metrics.append(micro_avg_recall) - metrics.append(calc_f1_score(micro_avg_precision, micro_avg_recall)) - return np.array(metrics).astype('float32') - - -class TestPrecisionRecallOp_0(OpTest): - def setUp(self): - self.op_type = "precision_recall" - ins_num = 64 - cls_num = 10 - max_probs = np.random.uniform(0, 1.0, (ins_num, 1)).astype('float32') - idxs = ( - np.random.choice(range(cls_num), ins_num) - .reshape((ins_num, 1)) - .astype('int32') - ) - labels = ( - np.random.choice(range(cls_num), ins_num) - .reshape((ins_num, 1)) - .astype('int32') - ) - states = get_states(idxs, labels, cls_num) - metrics = compute_metrics(states, cls_num) - - self.attrs = {'class_number': cls_num} - - self.inputs = {'MaxProbs': max_probs, 'Indices': idxs, 'Labels': labels} - - self.outputs = { - 'BatchMetrics': metrics, - 'AccumMetrics': metrics, - 'AccumStatesInfo': states, - } - - def test_check_output(self): - self.check_output() - - -class TestPrecisionRecallOp_1(OpTest): - def setUp(self): - self.op_type = "precision_recall" - ins_num = 64 - cls_num = 10 - max_probs = np.random.uniform(0, 1.0, (ins_num, 1)).astype('float32') - idxs = ( - np.random.choice(range(cls_num), ins_num) - .reshape((ins_num, 1)) - .astype('int32') - ) - weights = np.random.uniform(0, 1.0, (ins_num, 1)).astype('float32') - labels = ( - np.random.choice(range(cls_num), ins_num) - .reshape((ins_num, 1)) - .astype('int32') - ) - - states = get_states(idxs, labels, cls_num, weights) - metrics = compute_metrics(states, cls_num) - - self.attrs = {'class_number': cls_num} - - self.inputs = { - 'MaxProbs': 
max_probs, - 'Indices': idxs, - 'Labels': labels, - 'Weights': weights, - } - - self.outputs = { - 'BatchMetrics': metrics, - 'AccumMetrics': metrics, - 'AccumStatesInfo': states, - } - - def test_check_output(self): - self.check_output() - - -class TestPrecisionRecallOp_2(OpTest): - def setUp(self): - self.op_type = "precision_recall" - ins_num = 64 - cls_num = 10 - max_probs = np.random.uniform(0, 1.0, (ins_num, 1)).astype('float32') - idxs = ( - np.random.choice(range(cls_num), ins_num) - .reshape((ins_num, 1)) - .astype('int32') - ) - weights = np.random.uniform(0, 1.0, (ins_num, 1)).astype('float32') - labels = ( - np.random.choice(range(cls_num), ins_num) - .reshape((ins_num, 1)) - .astype('int32') - ) - states = np.random.randint(0, 30, (cls_num, 4)).astype('float32') - - accum_states = get_states(idxs, labels, cls_num, weights) - batch_metrics = compute_metrics(accum_states, cls_num) - accum_states += states - accum_metrics = compute_metrics(accum_states, cls_num) - - self.attrs = {'class_number': cls_num} - - self.inputs = { - 'MaxProbs': max_probs, - 'Indices': idxs, - 'Labels': labels, - 'Weights': weights, - 'StatesInfo': states, - } - - self.outputs = { - 'BatchMetrics': batch_metrics, - 'AccumMetrics': accum_metrics, - 'AccumStatesInfo': accum_states, - } - - def test_check_output(self): - self.check_output() - - -if __name__ == '__main__': - unittest.main() diff --git a/test/legacy_test/test_queue.py b/test/legacy_test/test_queue.py deleted file mode 100644 index 5a1cbd53d43aa..0000000000000 --- a/test/legacy_test/test_queue.py +++ /dev/null @@ -1,82 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np - -import paddle -from paddle import base -from paddle.base import core - - -class TestQueue(unittest.TestCase): - def test_eq(self): - """ - test queue_generator op, enqueue op and dequeue op. 
- """ - - main_program = base.Program() - startup_program = base.Program() - value = np.random.rand(1) - with base.program_guard(main_program, startup_program): - data_in = paddle.static.create_global_var( - shape=[2, 3], - value=value, - dtype="float32", - persistable=True, - name='var_in', - ) - data_out = paddle.static.create_global_var( - shape=[2, 3], - value=value - 1.0, - dtype="float32", - persistable=True, - name='var_out', - ) - startup_block = startup_program.block(0) - queue_name = 'blocking_queue' - startup_block.create_var( - name=queue_name, persistable=True, type=core.VarDesc.VarType.RAW - ) - startup_block.append_op( - type="queue_generator", attrs={'names': [queue_name]} - ) - block = main_program.block(0) - block.append_op( - type='enqueue', - inputs={'X': data_in}, - attrs={'queue_name': queue_name}, - ) - block.append_op( - type='dequeue', - outputs={'Out': [data_out]}, - attrs={'queue_name': queue_name}, - ) - - place = ( - base.CUDAPlace(0) - if core.is_compiled_with_cuda() - else base.CPUPlace() - ) - exe = base.Executor(place) - exe.run(startup_program) - (ret,) = exe.run(main_program, fetch_list=[data_out.name]) - np.testing.assert_allclose( - np.asarray(ret), np.full((2, 3), value, np.float32), rtol=1e-05 - ) - - -if __name__ == '__main__': - unittest.main() diff --git a/test/legacy_test/test_rank_attention_op.py b/test/legacy_test/test_rank_attention_op.py deleted file mode 100644 index 514463b0cbae4..0000000000000 --- a/test/legacy_test/test_rank_attention_op.py +++ /dev/null @@ -1,251 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import random -import unittest - -import numpy as np -from op_test import OpTest - -from paddle.base import core - - -def gen_input_help(input, rank_offset, max_rank, max_size): - input_row, input_col = input.shape - max_ins = np.max((max_size, input_row)) - input_help = np.zeros(max_ins * max_rank * input_col) - ins_rank = np.zeros((max_ins, 1)) - ins_rank.fill(-1) - - output_col = max_rank * input_col - output_row = input_row - - for idx in range(output_col * output_row): - output_col_idx = idx % output_col - output_row_idx = int(idx / output_col) - k = int(output_col_idx / input_col) - faster = rank_offset[output_row_idx, 2 * k + 1] - 1 - - if output_col_idx == 0: - ins_rank[output_row_idx] = rank_offset[output_row_idx, 0] - - if rank_offset[output_row_idx, 0] - 1 < 0 or faster < 0: - continue - - rank_input_col_idx = output_col_idx % input_col - index = rank_offset[output_row_idx, 2 * k + 2] - input_help[idx] = input[index, rank_input_col_idx] - input_help = input_help.reshape([max_ins, max_rank * input_col]) - - return input_help, ins_rank - - -def gen_param_help(input, rank_offset, param, max_rank): - input_row, input_col = input.shape - rank_offset_row, rank_offset_col = rank_offset.shape - param_row, param_col = param.shape - - block_matrix_row = input_col * max_rank - - output_param_row = block_matrix_row * input_row - output_param_col = param_col - - output_param = np.zeros((output_param_row * output_param_col,)) - - for idx in range(output_param_row * output_param_col): - output_col_idx = idx % output_param_col - output_row_idx = int(idx / output_param_col) - ins_idx = int(output_row_idx / block_matrix_row) - start_offset = output_row_idx % block_matrix_row - k = int(start_offset / input_col) - k_offset = start_offset % input_col - - lower = rank_offset[ins_idx, 0] - 1 - faster = rank_offset[ins_idx, 2 * k + 1] - 1 - if lower < 0 or faster < 0: - continue - start = lower * max_rank + faster - ori_idx = ( - start * param_col * input_col - + k_offset * param_col - + output_col_idx - ) - output_param[idx] = param[int(ori_idx / param_col), ori_idx % param_col] - - output_param = output_param.reshape([output_param_row, output_param_col]) - return output_param - - -def np_rank_attention(input, rank_offset, rank_para, max_rank, max_size): - input_row, input_col = input.shape - rank_offset_row, rank_offset_col = rank_offset.shape - rank_para_row, rank_para_col = rank_para.shape - - assert input_row == rank_offset_row - assert max_rank == ((rank_offset_col - 1) / 2) - assert rank_para_row == max_rank * max_rank * input_col - - input_help, ins_rank = gen_input_help( - input, rank_offset, max_rank, max_size - ) - param_help = gen_param_help(input, rank_offset, rank_para, max_rank) - block_matrix_row = input_col * max_rank - - res = np.zeros((input_row, rank_para_col)) - for ins in range(input_row): - res[ins, :] = np.dot( - input_help[ins, :], - param_help[ - int(block_matrix_row * ins) : int(block_matrix_row * (ins + 1)), - :, - ], - ) - return res, input_help, param_help, ins_rank - - -def gen_rank_offset(pv_nums, max_rank): - all_ins_num = 0 - pv_rank_msg = [] - for _ in range(pv_nums): - ins_pv = np.random.randint(1, max_rank + 2) # 1~4 - rank_list = list(range(1, ins_pv + 1)) - random.shuffle(rank_list) - all_ins_num = all_ins_num + ins_pv - pv_rank_msg.append(rank_list) - - rank_offset = np.zeros((all_ins_num, max_rank * 2 + 1)).astype("int32") - rank_offset.fill(-1) - index = 0 - for pv_number in range(len(pv_rank_msg)): - pv_ins = pv_rank_msg[pv_number] - ad_num = len(pv_ins) - 
index_start = index - - for j in range(ad_num): - rank = -1 - if pv_ins[j] <= max_rank: - rank = pv_ins[j] - rank_offset[index, 0] = rank - - if rank > 0: - for k in range(ad_num): - fast_rank = -1 - if pv_ins[k] <= max_rank: - fast_rank = pv_ins[k] - if fast_rank > 0: - m = fast_rank - 1 - rank_offset[index, 2 * m + 1] = pv_ins[k] - rank_offset[index, 2 * m + 2] = index_start + k - index = index + 1 - return all_ins_num, rank_offset - - -class TestRankAttentionOpComplex(OpTest): - def config(self): - self.pv_num = 100 - self.x_feat = 10 - self.y_feat = 15 - self.max_rank = 3 - self.dtype = "float64" - - def setUp(self): - self.op_type = "rank_attention" - self.config() - ins_num, rank_offset = gen_rank_offset(self.pv_num, self.max_rank) - input = np.random.random((ins_num, self.x_feat)).astype(self.dtype) - rank_para_shape = [ - self.max_rank * self.max_rank * self.x_feat, - self.y_feat, - ] - rank_para = np.random.random(rank_para_shape).astype(self.dtype) - np_out, np_input_help, np_param_help, np_ins_rank = np_rank_attention( - input, - np.array(rank_offset), - rank_para, - self.max_rank, - self.pv_num * 7, - ) - self.inputs = { - "X": input, - "RankOffset": np.array(rank_offset).astype("int32"), - "RankParam": rank_para, - } - self.attrs = {'MaxRank': self.max_rank, 'MaxSize': self.pv_num * 7} - self.outputs = { - "Out": np_out, - "InputHelp": np_input_help, - "InsRank": np_ins_rank, - } - - def test_check_output_gpu(self): - if core.is_compiled_with_cuda(): - self.check_output_with_place(core.CUDAPlace(0)) - - def test_check_grad_gpu(self): - if core.is_compiled_with_cuda(): - self.check_grad_with_place(core.CUDAPlace(0), ["RankParam"], "Out") - - -class TestRankAttentionOpCpu(OpTest): - def config(self): - self.pv_num = 100 - self.x_feat = 10 - self.y_feat = 15 - self.max_rank = 3 - self.dtype = "float64" - - def setUp(self): - self.op_type = "rank_attention" - self.config() - ins_num, rank_offset = gen_rank_offset(self.pv_num, self.max_rank) - input = np.random.random((ins_num, self.x_feat)).astype(self.dtype) - rank_para_shape = [ - self.max_rank * self.max_rank * self.x_feat, - self.y_feat, - ] - rank_para = np.random.random(rank_para_shape).astype(self.dtype) - np_out, np_input_help, np_param_help, np_ins_rank = np_rank_attention( - input, - np.array(rank_offset), - rank_para, - self.max_rank, - self.pv_num * 7, - ) - self.inputs = { - "X": input, - "RankOffset": np.array(rank_offset).astype("int32"), - "RankParam": rank_para, - } - self.attrs = {'MaxRank': self.max_rank, 'MaxSize': self.pv_num * 7} - self.outputs = { - "Out": np_out, - "InputHelp": np_input_help, - "InsRank": np_ins_rank, - } - - def test_check_output_cpu(self): - try: - self.check_output_with_place(place=core.CPUPlace()) - except: - print("do not support cpu test, skip") - - def test_check_grad_cpu(self): - try: - self.check_grad_with_place(core.CPUPlace(), ["RankParam"], "Out") - except: - print("do not support cpu test, skip") - - -if __name__ == "__main__": - unittest.main() diff --git a/test/legacy_test/test_retinanet_detection_output.py b/test/legacy_test/test_retinanet_detection_output.py deleted file mode 100644 index a120dfd50eefc..0000000000000 --- a/test/legacy_test/test_retinanet_detection_output.py +++ /dev/null @@ -1,511 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License") -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import math -import unittest - -import numpy as np -from op_test import OpTest -from test_anchor_generator_op import anchor_generator_in_python -from test_multiclass_nms_op import nms - -import paddle - - -def multiclass_nms(prediction, class_num, keep_top_k, nms_threshold): - selected_indices = {} - num_det = 0 - for c in range(class_num): - if c not in prediction.keys(): - continue - cls_dets = prediction[c] - all_scores = np.zeros(len(cls_dets)) - for i in range(all_scores.shape[0]): - all_scores[i] = cls_dets[i][4] - indices = nms(cls_dets, all_scores, 0.0, nms_threshold, -1, False, 1.0) - selected_indices[c] = indices - num_det += len(indices) - - score_index = [] - for c, indices in selected_indices.items(): - for idx in indices: - score_index.append((prediction[c][idx][4], c, idx)) - - sorted_score_index = sorted( - score_index, key=lambda tup: tup[0], reverse=True - ) - if keep_top_k > -1 and num_det > keep_top_k: - sorted_score_index = sorted_score_index[:keep_top_k] - num_det = keep_top_k - nmsed_outs = [] - for s, c, idx in sorted_score_index: - xmin = prediction[c][idx][0] - ymin = prediction[c][idx][1] - xmax = prediction[c][idx][2] - ymax = prediction[c][idx][3] - nmsed_outs.append([c + 1, s, xmin, ymin, xmax, ymax]) - - return nmsed_outs, num_det - - -def retinanet_detection_out( - boxes_list, - scores_list, - anchors_list, - im_info, - score_threshold, - nms_threshold, - nms_top_k, - keep_top_k, -): - class_num = scores_list[0].shape[-1] - im_height, im_width, im_scale = im_info - - num_level = len(scores_list) - prediction = {} - for lvl in range(num_level): - scores_per_level = scores_list[lvl] - scores_per_level = scores_per_level.flatten() - bboxes_per_level = boxes_list[lvl] - bboxes_per_level = bboxes_per_level.flatten() - anchors_per_level = anchors_list[lvl] - anchors_per_level = anchors_per_level.flatten() - - thresh = score_threshold if lvl < (num_level - 1) else 0.0 - selected_indices = np.argwhere(scores_per_level > thresh) - scores = scores_per_level[selected_indices] - sorted_indices = np.argsort(-scores, axis=0, kind='mergesort') - if nms_top_k > -1 and nms_top_k < sorted_indices.shape[0]: - sorted_indices = sorted_indices[:nms_top_k] - - for i in range(sorted_indices.shape[0]): - idx = selected_indices[sorted_indices[i]] - idx = idx[0][0] - a = int(idx / class_num) - c = int(idx % class_num) - box_offset = a * 4 - anchor_box_width = ( - anchors_per_level[box_offset + 2] - - anchors_per_level[box_offset] - + 1 - ) - anchor_box_height = ( - anchors_per_level[box_offset + 3] - - anchors_per_level[box_offset + 1] - + 1 - ) - anchor_box_center_x = ( - anchors_per_level[box_offset] + anchor_box_width / 2 - ) - anchor_box_center_y = ( - anchors_per_level[box_offset + 1] + anchor_box_height / 2 - ) - - target_box_center_x = ( - bboxes_per_level[box_offset] * anchor_box_width - + anchor_box_center_x - ) - target_box_center_y = ( - bboxes_per_level[box_offset + 1] * anchor_box_height - + anchor_box_center_y - ) - target_box_width = ( - math.exp(bboxes_per_level[box_offset + 2]) * anchor_box_width - ) - target_box_height = ( - 
math.exp(bboxes_per_level[box_offset + 3]) * anchor_box_height - ) - - pred_box_xmin = target_box_center_x - target_box_width / 2 - pred_box_ymin = target_box_center_y - target_box_height / 2 - pred_box_xmax = target_box_center_x + target_box_width / 2 - 1 - pred_box_ymax = target_box_center_y + target_box_height / 2 - 1 - - pred_box_xmin = pred_box_xmin / im_scale - pred_box_ymin = pred_box_ymin / im_scale - pred_box_xmax = pred_box_xmax / im_scale - pred_box_ymax = pred_box_ymax / im_scale - - pred_box_xmin = max( - min(pred_box_xmin, np.round(im_width / im_scale) - 1), 0.0 - ) - pred_box_ymin = max( - min(pred_box_ymin, np.round(im_height / im_scale) - 1), 0.0 - ) - pred_box_xmax = max( - min(pred_box_xmax, np.round(im_width / im_scale) - 1), 0.0 - ) - pred_box_ymax = max( - min(pred_box_ymax, np.round(im_height / im_scale) - 1), 0.0 - ) - - if c not in prediction.keys(): - prediction[c] = [] - prediction[c].append( - [ - pred_box_xmin, - pred_box_ymin, - pred_box_xmax, - pred_box_ymax, - scores_per_level[idx], - ] - ) - - nmsed_outs, nmsed_num = multiclass_nms( - prediction, class_num, keep_top_k, nms_threshold - ) - return nmsed_outs, nmsed_num - - -def batched_retinanet_detection_out( - boxes, - scores, - anchors, - im_info, - score_threshold, - nms_threshold, - nms_top_k, - keep_top_k, -): - batch_size = scores[0].shape[0] - det_outs = [] - lod = [] - - for n in range(batch_size): - boxes_per_batch = [] - scores_per_batch = [] - - num_level = len(scores) - for lvl in range(num_level): - boxes_per_batch.append(boxes[lvl][n]) - scores_per_batch.append(scores[lvl][n]) - - nmsed_outs, nmsed_num = retinanet_detection_out( - boxes_per_batch, - scores_per_batch, - anchors, - im_info[n], - score_threshold, - nms_threshold, - nms_top_k, - keep_top_k, - ) - lod.append(nmsed_num) - if nmsed_num == 0: - continue - - det_outs.extend(nmsed_outs) - return det_outs, lod - - -class TestRetinanetDetectionOutOp1(OpTest): - def set_argument(self): - self.score_threshold = 0.05 - self.min_level = 3 - self.max_level = 7 - self.nms_threshold = 0.3 - self.nms_top_k = 1000 - self.keep_top_k = 200 - - self.scales_per_octave = 3 - self.aspect_ratios = [1.0, 2.0, 0.5] - self.anchor_scale = 4 - self.anchor_strides = [8, 16, 32, 64, 128] - - self.box_size = 4 - self.class_num = 80 - self.batch_size = 1 - self.input_channels = 20 - - self.layer_h = [] - self.layer_w = [] - num_levels = self.max_level - self.min_level + 1 - for i in range(num_levels): - self.layer_h.append(2 ** (num_levels - i)) - self.layer_w.append(2 ** (num_levels - i)) - - def init_test_input(self): - anchor_num = len(self.aspect_ratios) * self.scales_per_octave - num_levels = self.max_level - self.min_level + 1 - self.scores_list = [] - self.bboxes_list = [] - self.anchors_list = [] - - for i in range(num_levels): - layer_h = self.layer_h[i] - layer_w = self.layer_w[i] - - input_feat = np.random.random( - (self.batch_size, self.input_channels, layer_h, layer_w) - ).astype('float32') - score = np.random.random( - (self.batch_size, self.class_num * anchor_num, layer_h, layer_w) - ).astype('float32') - score = np.transpose(score, [0, 2, 3, 1]) - score = score.reshape((self.batch_size, -1, self.class_num)) - box = np.random.random( - (self.batch_size, self.box_size * anchor_num, layer_h, layer_w) - ).astype('float32') - box = np.transpose(box, [0, 2, 3, 1]) - box = box.reshape((self.batch_size, -1, self.box_size)) - anchor_sizes = [] - for octave in range(self.scales_per_octave): - anchor_sizes.append( - float(self.anchor_strides[i] * 
(2**octave)) - / float(self.scales_per_octave) - * self.anchor_scale - ) - anchor, var = anchor_generator_in_python( - input_feat=input_feat, - anchor_sizes=anchor_sizes, - aspect_ratios=self.aspect_ratios, - variances=[1.0, 1.0, 1.0, 1.0], - stride=[self.anchor_strides[i], self.anchor_strides[i]], - offset=0.5, - ) - anchor = np.reshape(anchor, [-1, 4]) - self.scores_list.append(score.astype('float32')) - self.bboxes_list.append(box.astype('float32')) - self.anchors_list.append(anchor.astype('float32')) - - self.im_info = np.array([[256.0, 256.0, 1.5]]).astype( - 'float32' - ) # im_height, im_width, scale - - def setUp(self): - self.set_argument() - self.init_test_input() - - nmsed_outs, lod = batched_retinanet_detection_out( - self.bboxes_list, - self.scores_list, - self.anchors_list, - self.im_info, - self.score_threshold, - self.nms_threshold, - self.nms_top_k, - self.keep_top_k, - ) - nmsed_outs = np.array(nmsed_outs).astype('float32') - self.op_type = 'retinanet_detection_output' - self.inputs = { - 'BBoxes': [ - ('b0', self.bboxes_list[0]), - ('b1', self.bboxes_list[1]), - ('b2', self.bboxes_list[2]), - ('b3', self.bboxes_list[3]), - ('b4', self.bboxes_list[4]), - ], - 'Scores': [ - ('s0', self.scores_list[0]), - ('s1', self.scores_list[1]), - ('s2', self.scores_list[2]), - ('s3', self.scores_list[3]), - ('s4', self.scores_list[4]), - ], - 'Anchors': [ - ('a0', self.anchors_list[0]), - ('a1', self.anchors_list[1]), - ('a2', self.anchors_list[2]), - ('a3', self.anchors_list[3]), - ('a4', self.anchors_list[4]), - ], - 'ImInfo': ( - self.im_info, - [ - [ - 1, - ] - ], - ), - } - self.outputs = {'Out': (nmsed_outs, [lod])} - self.attrs = { - 'score_threshold': self.score_threshold, - 'nms_top_k': self.nms_top_k, - 'nms_threshold': self.nms_threshold, - 'keep_top_k': self.keep_top_k, - 'nms_eta': 1.0, - } - - def test_check_output(self): - self.check_output() - - -class TestRetinanetDetectionOutOp2(OpTest): - def set_argument(self): - self.score_threshold = 0.05 - self.min_level = 3 - self.max_level = 7 - self.nms_threshold = 0.3 - self.nms_top_k = 1000 - self.keep_top_k = 200 - - self.scales_per_octave = 3 - self.aspect_ratios = [1.0, 2.0, 0.5] - self.anchor_scale = 4 - self.anchor_strides = [8, 16, 32, 64, 128] - - self.box_size = 4 - self.class_num = 80 - self.batch_size = 1 - self.input_channels = 20 - # Here test the case there the shape of each FPN level - # is irrelevant. - self.layer_h = [1, 4, 8, 8, 16] - self.layer_w = [1, 4, 8, 8, 16] - - -class TestRetinanetDetectionOutOpNo3(TestRetinanetDetectionOutOp1): - def set_argument(self): - # Here set 2.0 to test the case there is no outputs. 
- # In practical use, 0.0 < score_threshold < 1.0 - self.score_threshold = 2.0 - self.min_level = 3 - self.max_level = 7 - self.nms_threshold = 0.3 - self.nms_top_k = 1000 - self.keep_top_k = 200 - - self.scales_per_octave = 3 - self.aspect_ratios = [1.0, 2.0, 0.5] - self.anchor_scale = 4 - self.anchor_strides = [8, 16, 32, 64, 128] - - self.box_size = 4 - self.class_num = 80 - self.batch_size = 1 - self.input_channels = 20 - - self.layer_h = [] - self.layer_w = [] - num_levels = self.max_level - self.min_level + 1 - for i in range(num_levels): - self.layer_h.append(2 ** (num_levels - i)) - self.layer_w.append(2 ** (num_levels - i)) - - -class TestRetinanetDetectionOutOpNo4(TestRetinanetDetectionOutOp1): - def set_argument(self): - self.score_threshold = 0.05 - self.min_level = 2 - self.max_level = 5 - self.nms_threshold = 0.3 - self.nms_top_k = 1000 - self.keep_top_k = 200 - - self.scales_per_octave = 3 - self.aspect_ratios = [1.0, 2.0, 0.5] - self.anchor_scale = 4 - self.anchor_strides = [8, 16, 32, 64, 128] - - self.box_size = 4 - self.class_num = 80 - self.batch_size = 1 - self.input_channels = 20 - - self.layer_h = [] - self.layer_w = [] - num_levels = self.max_level - self.min_level + 1 - for i in range(num_levels): - self.layer_h.append(2 ** (num_levels - i)) - self.layer_w.append(2 ** (num_levels - i)) - - def setUp(self): - self.set_argument() - self.init_test_input() - - nmsed_outs, lod = batched_retinanet_detection_out( - self.bboxes_list, - self.scores_list, - self.anchors_list, - self.im_info, - self.score_threshold, - self.nms_threshold, - self.nms_top_k, - self.keep_top_k, - ) - nmsed_outs = np.array(nmsed_outs).astype('float32') - self.op_type = 'retinanet_detection_output' - self.inputs = { - 'BBoxes': [ - ('b0', self.bboxes_list[0]), - ('b1', self.bboxes_list[1]), - ('b2', self.bboxes_list[2]), - ('b3', self.bboxes_list[3]), - ], - 'Scores': [ - ('s0', self.scores_list[0]), - ('s1', self.scores_list[1]), - ('s2', self.scores_list[2]), - ('s3', self.scores_list[3]), - ], - 'Anchors': [ - ('a0', self.anchors_list[0]), - ('a1', self.anchors_list[1]), - ('a2', self.anchors_list[2]), - ('a3', self.anchors_list[3]), - ], - 'ImInfo': ( - self.im_info, - [ - [ - 1, - ] - ], - ), - } - self.outputs = {'Out': (nmsed_outs, [lod])} - self.attrs = { - 'score_threshold': self.score_threshold, - 'nms_top_k': self.nms_top_k, - 'nms_threshold': self.nms_threshold, - 'keep_top_k': self.keep_top_k, - 'nms_eta': 1.0, - } - - def test_check_output(self): - self.check_output() - - -class TestRetinanetDetectionOutOpNo5(TestRetinanetDetectionOutOp1): - def set_argument(self): - self.score_threshold = 0.05 - self.min_level = 3 - self.max_level = 7 - self.nms_threshold = 0.3 - self.nms_top_k = 100 - self.keep_top_k = 10 - - self.scales_per_octave = 3 - self.aspect_ratios = [1.0, 2.0, 0.5] - self.anchor_scale = 4 - self.anchor_strides = [8, 16, 32, 64, 128] - - self.box_size = 4 - self.class_num = 80 - self.batch_size = 1 - self.input_channels = 20 - - self.layer_h = [] - self.layer_w = [] - num_levels = self.max_level - self.min_level + 1 - for i in range(num_levels): - self.layer_h.append(2 ** (num_levels - i)) - self.layer_w.append(2 ** (num_levels - i)) - - -if __name__ == '__main__': - paddle.enable_static() - unittest.main() diff --git a/test/legacy_test/test_rpn_target_assign_op.py b/test/legacy_test/test_rpn_target_assign_op.py deleted file mode 100644 index d0147d8b700f1..0000000000000 --- a/test/legacy_test/test_rpn_target_assign_op.py +++ /dev/null @@ -1,486 +0,0 @@ -# Copyright (c) 2018 
PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -from op_test import OpTest -from test_anchor_generator_op import anchor_generator_in_python -from test_generate_proposal_labels_op import ( - _bbox_overlaps, - _box_to_delta, - _generate_groundtruth, -) - - -def rpn_target_assign( - anchor_by_gt_overlap, - rpn_batch_size_per_im, - rpn_positive_overlap, - rpn_negative_overlap, - rpn_fg_fraction, - use_random=True, -): - anchor_to_gt_argmax = anchor_by_gt_overlap.argmax(axis=1) - anchor_to_gt_max = anchor_by_gt_overlap[ - np.arange(anchor_by_gt_overlap.shape[0]), anchor_to_gt_argmax - ] - - gt_to_anchor_argmax = anchor_by_gt_overlap.argmax(axis=0) - gt_to_anchor_max = anchor_by_gt_overlap[ - gt_to_anchor_argmax, np.arange(anchor_by_gt_overlap.shape[1]) - ] - anchors_with_max_overlap = np.where( - anchor_by_gt_overlap == gt_to_anchor_max - )[0] - - labels = np.ones((anchor_by_gt_overlap.shape[0],), dtype=np.int32) * -1 - labels[anchors_with_max_overlap] = 1 - labels[anchor_to_gt_max >= rpn_positive_overlap] = 1 - - num_fg = int(rpn_fg_fraction * rpn_batch_size_per_im) - fg_inds = np.where(labels == 1)[0] - if len(fg_inds) > num_fg and use_random: - disable_inds = np.random.choice( - fg_inds, size=(len(fg_inds) - num_fg), replace=False - ) - else: - disable_inds = fg_inds[num_fg:] - - labels[disable_inds] = -1 - fg_inds = np.where(labels == 1)[0] - bbox_inside_weight = np.zeros((len(fg_inds), 4), dtype=np.float32) - - num_bg = rpn_batch_size_per_im - np.sum(labels == 1) - bg_inds = np.where(anchor_to_gt_max < rpn_negative_overlap)[0] - if len(bg_inds) > num_bg and use_random: - enable_inds = bg_inds[np.random.randint(len(bg_inds), size=num_bg)] - else: - enable_inds = bg_inds[:num_bg] - - fg_fake_inds = np.array([], np.int32) - fg_value = np.array([fg_inds[0]], np.int32) - fake_num = 0 - for bg_id in enable_inds: - if bg_id in fg_inds: - fake_num += 1 - fg_fake_inds = np.hstack([fg_fake_inds, fg_value]) - labels[enable_inds] = 0 - - bbox_inside_weight[fake_num:, :] = 1 - fg_inds = np.where(labels == 1)[0] - bg_inds = np.where(labels == 0)[0] - loc_index = np.hstack([fg_fake_inds, fg_inds]) - score_index = np.hstack([fg_inds, bg_inds]) - labels = labels[score_index] - assert not np.any(labels == -1), "Wrong labels with -1" - - gt_inds = anchor_to_gt_argmax[loc_index] - - return loc_index, score_index, labels, gt_inds, bbox_inside_weight - - -def get_anchor(n, c, h, w): - input_feat = np.random.random((n, c, h, w)).astype('float32') - anchors, _ = anchor_generator_in_python( - input_feat=input_feat, - anchor_sizes=[32.0, 64.0], - aspect_ratios=[0.5, 1.0], - variances=[1.0, 1.0, 1.0, 1.0], - stride=[16.0, 16.0], - offset=0.5, - ) - return anchors - - -def rpn_target_assign_in_python( - all_anchors, - gt_boxes, - is_crowd, - im_info, - lod, - rpn_straddle_thresh, - rpn_batch_size_per_im, - rpn_positive_overlap, - rpn_negative_overlap, - rpn_fg_fraction, - use_random=True, -): - anchor_num = all_anchors.shape[0] - 
batch_size = len(lod) - 1 - for i in range(batch_size): - im_height = im_info[i][0] - im_width = im_info[i][1] - im_scale = im_info[i][2] - if rpn_straddle_thresh >= 0: - # Only keep anchors inside the image by a margin of straddle_thresh - inds_inside = np.where( - (all_anchors[:, 0] >= -rpn_straddle_thresh) - & (all_anchors[:, 1] >= -rpn_straddle_thresh) - & (all_anchors[:, 2] < im_width + rpn_straddle_thresh) - & (all_anchors[:, 3] < im_height + rpn_straddle_thresh) - )[0] - # keep only inside anchors - inside_anchors = all_anchors[inds_inside, :] - else: - inds_inside = np.arange(all_anchors.shape[0]) - inside_anchors = all_anchors - - b, e = lod[i], lod[i + 1] - gt_boxes_slice = gt_boxes[b:e, :] * im_scale - is_crowd_slice = is_crowd[b:e] - - not_crowd_inds = np.where(is_crowd_slice == 0)[0] - gt_boxes_slice = gt_boxes_slice[not_crowd_inds] - iou = _bbox_overlaps(inside_anchors, gt_boxes_slice) - - ( - loc_inds, - score_inds, - labels, - gt_inds, - bbox_inside_weight, - ) = rpn_target_assign( - iou, - rpn_batch_size_per_im, - rpn_positive_overlap, - rpn_negative_overlap, - rpn_fg_fraction, - use_random, - ) - # unmap to all anchor - loc_inds = inds_inside[loc_inds] - score_inds = inds_inside[score_inds] - - sampled_gt = gt_boxes_slice[gt_inds] - sampled_anchor = all_anchors[loc_inds] - box_deltas = _box_to_delta( - sampled_anchor, sampled_gt, [1.0, 1.0, 1.0, 1.0] - ) - - if i == 0: - loc_indexes = loc_inds - score_indexes = score_inds - tgt_labels = labels - tgt_bboxes = box_deltas - bbox_inside_weights = bbox_inside_weight - else: - loc_indexes = np.concatenate( - [loc_indexes, loc_inds + i * anchor_num] - ) - score_indexes = np.concatenate( - [score_indexes, score_inds + i * anchor_num] - ) - tgt_labels = np.concatenate([tgt_labels, labels]) - tgt_bboxes = np.vstack([tgt_bboxes, box_deltas]) - bbox_inside_weights = np.vstack( - [bbox_inside_weights, bbox_inside_weight] - ) - - return ( - loc_indexes, - score_indexes, - tgt_bboxes, - tgt_labels, - bbox_inside_weights, - ) - - -def retinanet_target_assign( - anchor_by_gt_overlap, gt_labels, positive_overlap, negative_overlap -): - anchor_to_gt_argmax = anchor_by_gt_overlap.argmax(axis=1) - anchor_to_gt_max = anchor_by_gt_overlap[ - np.arange(anchor_by_gt_overlap.shape[0]), anchor_to_gt_argmax - ] - - gt_to_anchor_argmax = anchor_by_gt_overlap.argmax(axis=0) - gt_to_anchor_max = anchor_by_gt_overlap[ - gt_to_anchor_argmax, np.arange(anchor_by_gt_overlap.shape[1]) - ] - anchors_with_max_overlap = np.where( - anchor_by_gt_overlap == gt_to_anchor_max - )[0] - - labels = np.ones((anchor_by_gt_overlap.shape[0],), dtype=np.int32) * -1 - labels[anchors_with_max_overlap] = 1 - labels[anchor_to_gt_max >= positive_overlap] = 1 - - fg_inds = np.where(labels == 1)[0] - bbox_inside_weight = np.zeros((len(fg_inds), 4), dtype=np.float32) - - bg_inds = np.where(anchor_to_gt_max < negative_overlap)[0] - enable_inds = bg_inds - - fg_fake_inds = np.array([], np.int32) - fg_value = np.array([fg_inds[0]], np.int32) - fake_num = 0 - for bg_id in enable_inds: - if bg_id in fg_inds: - fake_num += 1 - fg_fake_inds = np.hstack([fg_fake_inds, fg_value]) - labels[enable_inds] = 0 - - bbox_inside_weight[fake_num:, :] = 1 - fg_inds = np.where(labels == 1)[0] - bg_inds = np.where(labels == 0)[0] - loc_index = np.hstack([fg_fake_inds, fg_inds]) - score_index = np.hstack([fg_inds, bg_inds]) - score_index_tmp = np.hstack([fg_inds]) - labels = labels[score_index] - - gt_inds = anchor_to_gt_argmax[loc_index] - label_inds = anchor_to_gt_argmax[score_index_tmp] - labels[0 
: len(fg_inds)] = np.squeeze(gt_labels[label_inds]) - fg_num = len(fg_fake_inds) + len(fg_inds) + 1 - assert not np.any(labels == -1), "Wrong labels with -1" - return loc_index, score_index, labels, gt_inds, bbox_inside_weight, fg_num - - -def retinanet_target_assign_in_python( - all_anchors, - gt_boxes, - gt_labels, - is_crowd, - im_info, - lod, - positive_overlap, - negative_overlap, -): - anchor_num = all_anchors.shape[0] - batch_size = len(lod) - 1 - for i in range(batch_size): - im_scale = im_info[i][2] - - inds_inside = np.arange(all_anchors.shape[0]) - inside_anchors = all_anchors - b, e = lod[i], lod[i + 1] - gt_boxes_slice = gt_boxes[b:e, :] * im_scale - gt_labels_slice = gt_labels[b:e, :] - is_crowd_slice = is_crowd[b:e] - - not_crowd_inds = np.where(is_crowd_slice == 0)[0] - gt_boxes_slice = gt_boxes_slice[not_crowd_inds] - gt_labels_slice = gt_labels_slice[not_crowd_inds] - iou = _bbox_overlaps(inside_anchors, gt_boxes_slice) - - ( - loc_inds, - score_inds, - labels, - gt_inds, - bbox_inside_weight, - fg_num, - ) = retinanet_target_assign( - iou, gt_labels_slice, positive_overlap, negative_overlap - ) - # unmap to all anchor - loc_inds = inds_inside[loc_inds] - score_inds = inds_inside[score_inds] - - sampled_gt = gt_boxes_slice[gt_inds] - sampled_anchor = all_anchors[loc_inds] - box_deltas = _box_to_delta( - sampled_anchor, sampled_gt, [1.0, 1.0, 1.0, 1.0] - ) - - if i == 0: - loc_indexes = loc_inds - score_indexes = score_inds - tgt_labels = labels - tgt_bboxes = box_deltas - bbox_inside_weights = bbox_inside_weight - fg_nums = [[fg_num]] - else: - loc_indexes = np.concatenate( - [loc_indexes, loc_inds + i * anchor_num] - ) - score_indexes = np.concatenate( - [score_indexes, score_inds + i * anchor_num] - ) - tgt_labels = np.concatenate([tgt_labels, labels]) - tgt_bboxes = np.vstack([tgt_bboxes, box_deltas]) - bbox_inside_weights = np.vstack( - [bbox_inside_weights, bbox_inside_weight] - ) - fg_nums = np.concatenate([fg_nums, [[fg_num]]]) - - return ( - loc_indexes, - score_indexes, - tgt_bboxes, - tgt_labels, - bbox_inside_weights, - fg_nums, - ) - - -class TestRpnTargetAssignOp(OpTest): - def setUp(self): - n, c, h, w = 2, 4, 14, 14 - all_anchors = get_anchor(n, c, h, w) - gt_num = 10 - all_anchors = all_anchors.reshape(-1, 4) - anchor_num = all_anchors.shape[0] - - images_shape = [[64, 64], [64, 64]] - # images_shape = [[64, 64]] - groundtruth, lod = _generate_groundtruth(images_shape, 3, 4) - lod = [0, 4, 8] - # lod = [0, 4] - - im_info = np.ones((len(images_shape), 3)).astype(np.float32) - for i in range(len(images_shape)): - im_info[i, 0] = images_shape[i][0] - im_info[i, 1] = images_shape[i][1] - im_info[i, 2] = 0.8 # scale - gt_boxes = np.vstack([v['boxes'] for v in groundtruth]) - is_crowd = np.hstack([v['is_crowd'] for v in groundtruth]) - - all_anchors = all_anchors.astype('float32') - gt_boxes = gt_boxes.astype('float32') - - rpn_straddle_thresh = 0.0 - rpn_batch_size_per_im = 256 - rpn_positive_overlap = 0.7 - rpn_negative_overlap = 0.3 - rpn_fg_fraction = 0.5 - use_random = False - - ( - loc_index, - score_index, - tgt_bbox, - labels, - bbox_inside_weights, - ) = rpn_target_assign_in_python( - all_anchors, - gt_boxes, - is_crowd, - im_info, - lod, - rpn_straddle_thresh, - rpn_batch_size_per_im, - rpn_positive_overlap, - rpn_negative_overlap, - rpn_fg_fraction, - use_random, - ) - labels = labels[:, np.newaxis] - - self.op_type = "rpn_target_assign" - self.inputs = { - 'Anchor': all_anchors, - 'GtBoxes': (gt_boxes, [[4, 4]]), - 'IsCrowd': (is_crowd, [[4, 4]]), - 
'ImInfo': (im_info, [[1, 1]]), - } - self.attrs = { - 'rpn_batch_size_per_im': rpn_batch_size_per_im, - 'rpn_straddle_thresh': rpn_straddle_thresh, - 'rpn_positive_overlap': rpn_positive_overlap, - 'rpn_negative_overlap': rpn_negative_overlap, - 'rpn_fg_fraction': rpn_fg_fraction, - 'use_random': use_random, - } - self.outputs = { - 'LocationIndex': loc_index.astype('int32'), - 'ScoreIndex': score_index.astype('int32'), - 'TargetBBox': tgt_bbox.astype('float32'), - 'TargetLabel': labels.astype('int32'), - 'BBoxInsideWeight': bbox_inside_weights.astype('float32'), - } - - def test_check_output(self): - self.check_output() - - -class TestRetinanetTargetAssignOp(OpTest): - def setUp(self): - n, c, h, w = 2, 4, 14, 14 - all_anchors = get_anchor(n, c, h, w) - gt_num = 10 - all_anchors = all_anchors.reshape(-1, 4) - anchor_num = all_anchors.shape[0] - - images_shape = [[64, 64], [64, 64]] - groundtruth, lod = _generate_groundtruth(images_shape, 3, 4) - lod = [0, 4, 8] - - im_info = np.ones((len(images_shape), 3)).astype(np.float32) - for i in range(len(images_shape)): - im_info[i, 0] = images_shape[i][0] - im_info[i, 1] = images_shape[i][1] - im_info[i, 2] = 0.8 # scale - gt_boxes = np.vstack([v['boxes'] for v in groundtruth]) - is_crowd = np.hstack([v['is_crowd'] for v in groundtruth]) - gt_labels = np.vstack( - [ - v['gt_classes'].reshape(len(v['gt_classes']), 1) - for v in groundtruth - ] - ) - gt_labels = gt_labels.reshape(len(gt_labels), 1) - all_anchors = all_anchors.astype('float32') - gt_boxes = gt_boxes.astype('float32') - gt_labels = gt_labels.astype('int32') - - positive_overlap = 0.5 - negative_overlap = 0.4 - - ( - loc_index, - score_index, - tgt_bbox, - labels, - bbox_inside_weights, - fg_num, - ) = retinanet_target_assign_in_python( - all_anchors, - gt_boxes, - gt_labels, - is_crowd, - im_info, - lod, - positive_overlap, - negative_overlap, - ) - labels = labels[:, np.newaxis] - self.op_type = "retinanet_target_assign" - self.inputs = { - 'Anchor': all_anchors, - 'GtBoxes': (gt_boxes, [[4, 4]]), - 'GtLabels': (gt_labels, [[4, 4]]), - 'IsCrowd': (is_crowd, [[4, 4]]), - 'ImInfo': (im_info, [[1, 1]]), - } - self.attrs = { - 'positive_overlap': positive_overlap, - 'negative_overlap': negative_overlap, - } - self.outputs = { - 'LocationIndex': loc_index.astype('int32'), - 'ScoreIndex': score_index.astype('int32'), - 'TargetBBox': tgt_bbox.astype('float32'), - 'TargetLabel': labels.astype('int32'), - 'BBoxInsideWeight': bbox_inside_weights.astype('float32'), - 'ForegroundNumber': fg_num.astype('int32'), - } - - def test_check_output(self): - self.check_output() - - -if __name__ == '__main__': - unittest.main() diff --git a/test/legacy_test/test_similarity_focus_op.py b/test/legacy_test/test_similarity_focus_op.py deleted file mode 100755 index 1227a48949341..0000000000000 --- a/test/legacy_test/test_similarity_focus_op.py +++ /dev/null @@ -1,232 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
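
The file below re-implements the similarity_focus op in plain numpy. Its core step is a greedy selection per indexed channel: repeatedly take the largest remaining value whose row and column are both still unmarked, until min(rows, cols) cells are chosen. A minimal standalone sketch of that step, distilled from the reference loop below (the helper name is ours, not from the test):

import numpy as np

def similarity_focus_mask(channel_2d):
    # Greedy selection mirroring the deleted reference loop: pick the largest
    # remaining cell whose row and column are both unused, mark it, and stop
    # once min(rows, cols) cells are marked.
    h, w = channel_2d.shape
    flat = channel_2d.reshape(-1).astype("float64")
    mask = np.zeros(h * w, dtype="float32")
    row_used = np.zeros(h, dtype=bool)
    col_used = np.zeros(w, dtype=bool)
    marked = 0
    for _ in range(flat.size):
        idx = flat.argmax()
        r, c = divmod(idx, w)
        if not row_used[r] and not col_used[c]:
            row_used[r] = col_used[c] = True
            mask[idx] = 1
            marked += 1
            if marked == min(h, w):
                break
        flat[idx] = -np.inf  # exclude this cell from later argmax calls
    return mask.reshape(h, w)
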
- -import unittest - -import numpy as np -from op_test import OpTest - - -class TestSimilarityFocusOp(OpTest): - def setUp(self): - self.op_type = "similarity_focus" - batch_size = 2 - x_dim, y_dim, z_dim = 3, 2, 2 - self.inputs = { - 'X': np.array( - [ - [ - [[0.8, 0.1], [0.4, 0.5]], - [[0.9, 0.7], [0.9, 0.9]], - [[0.8, 0.9], [0.1, 0.2]], - ], - [ - [[0.2, 0.5], [0.3, 0.4]], - [[0.9, 0.7], [0.8, 0.4]], - [[0.0, 0.2], [0.4, 0.7]], - ], - ] - ), - } - self.attrs = { - 'axis': 1, - 'indexes': [0], - } - - output = None - for batch in range(batch_size): - res = np.zeros((1, y_dim, z_dim)).astype("float32").reshape(-1) - for index in self.attrs['indexes']: - channel = ( - self.inputs['X'][batch, index, :, :].reshape(-1).copy() - ) - tag1 = [0 for i in range(y_dim)] - tag2 = [0 for i in range(z_dim)] - cnt = 0 - for i in range(channel.size): - index = channel.argmax() - idx1 = index // z_dim - idx2 = index % z_dim - if tag1[idx1] + tag2[idx2] == 0: - tag1[idx1] = 1 - tag2[idx2] = 1 - res[index] = 1 - cnt += 1 - if cnt == min(y_dim, z_dim): - break - channel[index] = -1 - res = res.reshape(1, y_dim, z_dim).repeat([x_dim], axis=0) - res = res.reshape(1, x_dim, y_dim, z_dim) - if output is not None: - output = np.concatenate((output, res), axis=0) - else: - output = res - self.outputs = {'Out': output} - - def test_check_output(self): - self.check_output() - - -class TestSimilarityFocusOp_axis1(OpTest): - def setUp(self): - self.op_type = "similarity_focus" - batch_size = 3 - x_dim, y_dim, z_dim = 4, 5, 6 - self.inputs = { - 'X': np.random.random((batch_size, x_dim, y_dim, z_dim)).astype( - "float32" - ), - } - self.attrs = { - 'axis': 1, - 'indexes': [0, 3], - } - - output = None - for batch in range(batch_size): - res = np.zeros((1, y_dim, z_dim)).astype("float32").reshape(-1) - for index in self.attrs['indexes']: - channel = ( - self.inputs['X'][batch, index, :, :].reshape(-1).copy() - ) - tag1 = [0 for i in range(y_dim)] - tag2 = [0 for i in range(z_dim)] - cnt = 0 - for i in range(channel.size): - index = channel.argmax() - idx1 = index // z_dim - idx2 = index % z_dim - if tag1[idx1] + tag2[idx2] == 0: - tag1[idx1] = 1 - tag2[idx2] = 1 - res[index] = 1 - cnt += 1 - if cnt == min(y_dim, z_dim): - break - channel[index] = -1 - res = res.reshape(1, y_dim, z_dim) - res = res.repeat([x_dim], axis=0) - res = res.reshape(1, x_dim, y_dim, z_dim) - if output is not None: - output = np.concatenate((output, res), axis=0) - else: - output = res - self.outputs = {'Out': output} - - def test_check_output(self): - self.check_output() - - -class TestSimilarityFocusOp_axis2(OpTest): - def setUp(self): - self.op_type = "similarity_focus" - batch_size = 6 - x_dim, y_dim, z_dim = 7, 8, 9 - self.inputs = { - 'X': np.random.random((batch_size, x_dim, y_dim, z_dim)).astype( - "float32" - ), - } - self.attrs = { - 'axis': 2, - 'indexes': [0, 3, 5], - } - - output = None - for batch in range(batch_size): - res = np.zeros((x_dim, 1, z_dim)).astype("float32").reshape(-1) - for index in self.attrs['indexes']: - channel = ( - self.inputs['X'][batch, :, index, :].reshape(-1).copy() - ) - tag1 = [0 for i in range(x_dim)] - tag2 = [0 for i in range(z_dim)] - cnt = 0 - for i in range(channel.size): - index = channel.argmax() - idx1 = index // z_dim - idx2 = index % z_dim - if tag1[idx1] + tag2[idx2] == 0: - tag1[idx1] = 1 - tag2[idx2] = 1 - res[index] = 1 - cnt += 1 - if cnt == min(x_dim, z_dim): - break - channel[index] = -1 - res = res.reshape(x_dim, 1, z_dim) - res = res.repeat([y_dim], axis=1) - res = res.reshape(1, 
x_dim, y_dim, z_dim) - if output is not None: - output = np.concatenate((output, res), axis=0) - else: - output = res - self.outputs = {'Out': output} - - def test_check_output(self): - self.check_output() - - -class TestSimilarityFocusOp_axis3(OpTest): - def setUp(self): - self.op_type = "similarity_focus" - batch_size = 64 - x_dim, y_dim, z_dim = 48, 48, 13 - self.inputs = { - 'X': np.random.random((batch_size, x_dim, y_dim, z_dim)).astype( - "float32" - ), - } - self.attrs = { - 'axis': 3, - 'indexes': [0, 2, 7, 9], - } - - output = None - for batch in range(batch_size): - res = np.zeros((x_dim, y_dim, 1)).astype("float32").reshape(-1) - for index in self.attrs['indexes']: - channel = ( - self.inputs['X'][batch, :, :, index].reshape(-1).copy() - ) - tag1 = [0 for i in range(x_dim)] - tag2 = [0 for i in range(y_dim)] - cnt = 0 - for i in range(channel.size): - index = channel.argmax() - idx1 = index // y_dim - idx2 = index % y_dim - if tag1[idx1] + tag2[idx2] == 0: - tag1[idx1] = 1 - tag2[idx2] = 1 - res[index] = 1 - cnt += 1 - if cnt == min(x_dim, y_dim): - break - channel[index] = -1 - res = res.reshape(x_dim, y_dim, 1) - res = res.repeat([z_dim], axis=2) - res = res.reshape(1, x_dim, y_dim, z_dim) - if output is not None: - output = np.concatenate((output, res), axis=0) - else: - output = res - self.outputs = {'Out': output} - - def test_check_output(self): - self.check_output() - - -if __name__ == "__main__": - unittest.main() diff --git a/test/legacy_test/test_spp_op.py b/test/legacy_test/test_spp_op.py deleted file mode 100644 index fbf3440352590..0000000000000 --- a/test/legacy_test/test_spp_op.py +++ /dev/null @@ -1,91 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
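
The spp (spatial pyramid pooling) test deleted below derives each level's kernel and padding from the bin count, and its flattened output width from the pyramid height. A short sketch of that arithmetic, with hypothetical helper names:

import math

def spp_level_geometry(size, level):
    # Level i pools one spatial dim into 2**i bins; the deleted reference
    # uses kernel = ceil(size / bins) and pads so kernel * bins covers size.
    bins = 2 ** level
    kernel = math.ceil(size / bins)
    padding = (kernel * bins - size + 1) // 2
    return kernel, padding

def spp_output_dim(channels, pyramid_height):
    # Each level contributes channels * (2**i)**2 features after flattening.
    return channels * sum(4 ** i for i in range(pyramid_height))

# For the test's input shape [3, 2, 16, 16] with pyramid_height = 3:
# 2 * (1 + 4 + 16) == 42 output features per sample.
assert spp_output_dim(2, 3) == 42
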
- -import unittest - -import numpy as np -from op_test import OpTest -from test_pool2d_op import avg_pool2D_forward_naive, max_pool2D_forward_naive - - -class TestSppOp(OpTest): - def setUp(self): - self.op_type = "spp" - self.init_test_case() - nsize, csize, hsize, wsize = self.shape - data = np.array(list(range(nsize * csize * hsize * wsize))) - input = data.reshape(self.shape) - input_random = np.random.random(self.shape).astype("float64") - input = input + input_random - out_level_flatten = [] - for i in range(self.pyramid_height): - bins = np.power(2, i) - kernel_size = [0, 0] - padding = [0, 0] - kernel_size[0] = np.ceil(hsize / bins.astype("double")).astype( - "int32" - ) - padding[0] = ((kernel_size[0] * bins - hsize + 1) / 2).astype( - "int32" - ) - - kernel_size[1] = np.ceil(wsize / bins.astype("double")).astype( - "int32" - ) - padding[1] = ((kernel_size[1] * bins - wsize + 1) / 2).astype( - "int32" - ) - out_level = self.pool2D_forward_naive( - input, kernel_size, kernel_size, padding - ) - out_level_flatten.append( - out_level.reshape(nsize, bins * bins * csize) - ) - if i == 0: - output = out_level_flatten[i] - else: - output = np.concatenate((output, out_level_flatten[i]), 1) - # output = np.concatenate(out_level_flatten.tolist(), 0); - self.inputs = { - 'X': input.astype('float64'), - } - self.attrs = { - 'pyramid_height': self.pyramid_height, - 'pooling_type': self.pool_type, - } - self.outputs = {'Out': output.astype('float64')} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['X'], 'Out') - - def init_test_case(self): - self.shape = [3, 2, 16, 16] - self.pyramid_height = 3 - self.pool2D_forward_naive = max_pool2D_forward_naive - self.pool_type = "max" - - -class TestCase2(TestSppOp): - def init_test_case(self): - self.shape = [3, 2, 16, 16] - self.pyramid_height = 3 - self.pool2D_forward_naive = avg_pool2D_forward_naive - self.pool_type = "avg" - - -if __name__ == '__main__': - unittest.main() diff --git a/test/legacy_test/test_tdm_child_op.py b/test/legacy_test/test_tdm_child_op.py deleted file mode 100644 index b1c100a2a789f..0000000000000 --- a/test/legacy_test/test_tdm_child_op.py +++ /dev/null @@ -1,186 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
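
In the tdm_child test deleted below, each row of tree_info appears to encode [item_id, layer_id, parent, child_0, child_1]: children sit in columns 3 and 4, and a node is a leaf exactly when its column-0 item id is nonzero. A minimal sketch of the lookup the reference loop performs (helper name ours, column semantics inferred from the test data):

def tdm_children(tree_info, node, child_nums=2):
    # Node 0 is the padding node: it reports zero children with zero masks.
    if node == 0:
        children = [0] * child_nums
    else:
        children = [tree_info[node][3 + k] for k in range(child_nums)]
    # A child is a leaf iff its item id (column 0) is nonzero.
    leaf_mask = [int(tree_info[child][0] != 0) for child in children]
    return children, leaf_mask
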
- -import unittest - -import numpy as np -from op_test import OpTest, paddle_static_guard - -import paddle -from paddle import base -from paddle.incubate.layers.nn import tdm_child - - -def create_tdm_tree(): - """Create tdm tree info""" - tree_info = [ - [0, 0, 0, 1, 2], - [0, 1, 0, 3, 4], - [0, 1, 0, 5, 6], - [0, 2, 1, 7, 8], - [0, 2, 1, 9, 10], - [0, 2, 2, 11, 12], - [0, 2, 2, 13, 0], - [0, 3, 3, 14, 15], - [0, 3, 3, 16, 17], - [0, 3, 4, 18, 19], - [0, 3, 4, 20, 21], - [0, 3, 5, 22, 23], - [0, 3, 5, 24, 25], - [12, 3, 6, 0, 0], - [0, 4, 7, 0, 0], - [1, 4, 7, 0, 0], - [2, 4, 8, 0, 0], - [3, 4, 8, 0, 0], - [4, 4, 9, 0, 0], - [5, 4, 9, 0, 0], - [6, 4, 10, 0, 0], - [7, 4, 10, 0, 0], - [8, 4, 11, 0, 0], - [9, 4, 11, 0, 0], - [10, 4, 12, 0, 0], - [11, 4, 12, 0, 0], - ] - return tree_info - - -class TestTDMChildOp(OpTest): - def setUp(self): - self.__class__.op_type = "tdm_child" - self.config() - tree_info = create_tdm_tree() - tree_info_np = np.array(tree_info).astype(self.info_type) - - x_np = np.random.randint(low=0, high=26, size=self.x_shape).astype( - self.x_type - ) - children_res = [] - leaf_mask_res = [] - for batch in x_np: - for node in batch: - children = [] - if node != 0: - children.append(tree_info[node][3]) - children.append(tree_info[node][4]) - else: - children.append(0) - children.append(0) - mask = [] - for child in children: - m = int(tree_info[child][0] != 0) - mask.append(m) - children_res += children - leaf_mask_res += mask - children_res_np = np.array(children_res).astype(self.info_type) - leaf_mask_res_np = np.array(leaf_mask_res).astype(self.info_type) - - child = np.reshape(children_res_np, self.child_shape) - leaf_mask = np.reshape(leaf_mask_res_np, self.child_shape) - - self.attrs = {'child_nums': 2} - self.inputs = {'X': x_np, 'TreeInfo': tree_info_np} - self.outputs = {'Child': child, 'LeafMask': leaf_mask} - - def config(self): - """set test shape & type""" - self.x_shape = (10, 20) - self.child_shape = (10, 20, 2) - self.x_type = 'int32' - self.info_type = 'int32' - - def test_check_output(self): - self.check_output() - - -class TestCase1(TestTDMChildOp): - def config(self): - """check int int64_t""" - self.x_shape = (10, 20) - self.child_shape = (10, 20, 2) - self.x_type = 'int32' - self.info_type = 'int64' - - -class TestCase2(TestTDMChildOp): - def config(self): - """check int64_t int64_t""" - self.x_shape = (10, 20) - self.child_shape = (10, 20, 2) - self.x_type = 'int64' - self.info_type = 'int64' - - -class TestCase3(TestTDMChildOp): - def config(self): - """check int64 int32""" - self.x_shape = (10, 20) - self.child_shape = (10, 20, 2) - self.x_type = 'int64' - self.info_type = 'int32' - - -class TestCase4(TestTDMChildOp): - def config(self): - """check large shape""" - self.x_shape = (100, 20) - self.child_shape = (100, 20, 2) - self.x_type = 'int32' - self.info_type = 'int32' - - -class TestTDMChildShape(unittest.TestCase): - def test_shape(self): - with paddle_static_guard(): - x = paddle.static.data( - name='x', shape=[-1, 1], dtype='int32', lod_level=1 - ) - tdm_tree_info = create_tdm_tree() - tree_info_np = np.array(tdm_tree_info).astype('int32') - - child, leaf_mask = tdm_child( - x=x, - node_nums=26, - child_nums=2, - param_attr=base.ParamAttr( - initializer=paddle.nn.initializer.Assign(tree_info_np) - ), - ) - - place = base.CPUPlace() - exe = base.Executor(place=place) - exe.run(base.default_startup_program()) - - feed = { - 'x': np.array( - [ - [1], - [2], - [3], - [4], - [5], - [6], - [7], - [8], - [9], - [10], - [11], - [12], - ] - 
).astype('int32') - } - exe.run(feed=feed) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/legacy_test/test_teacher_student_sigmoid_loss_op.py b/test/legacy_test/test_teacher_student_sigmoid_loss_op.py deleted file mode 100644 index 984a47831064e..0000000000000 --- a/test/legacy_test/test_teacher_student_sigmoid_loss_op.py +++ /dev/null @@ -1,71 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from math import exp, log - -import numpy as np -from op_test import OpTest -from scipy.special import logit - - -class TestTeacherStudentSigmoidLossOp(OpTest): - """ - Test teacher_student_sigmoid_loss with discrete one-hot labels. - """ - - def setUp(self): - self.op_type = "teacher_student_sigmoid_loss" - batch_size = 100 - num_classes = 1 - self.inputs = { - 'X': logit( - np.random.uniform(0, 1, (batch_size, num_classes)).astype( - "float64" - ) - ), - 'Label': np.random.uniform(0, 2, (batch_size, num_classes)).astype( - "float64" - ), - } - outs = [] - for index, label in enumerate(self.inputs["Label"]): - x = self.inputs["X"][index] - if label < -1.0: - outs.append(max(x, 0.0) + log(1.0 + exp(-abs(x)))) - elif label < 0.0: - outs.append(max(x, 0.0) - x + log(1.0 + exp(-abs(x)))) - elif label < 1.0: - outs.append( - max(x, 0.0) - + log(1.0 + exp(-abs(x))) - + max(x, 0.0) - - x * label - + log(1.0 + exp(-abs(x))) - ) - else: - outs.append( - max(x, 0.0) - - x - + log(1.0 + exp(-abs(x))) - + max(x, 0.0) - - x * (label - 1.0) - + log(1.0 + exp(-abs(x))) - ) - self.outputs = {'Y': np.array(outs)} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(["X"], "Y", numeric_grad_delta=0.005) diff --git a/test/legacy_test/test_unique_with_counts.py b/test/legacy_test/test_unique_with_counts.py deleted file mode 100644 index 4cc2879bfab7a..0000000000000 --- a/test/legacy_test/test_unique_with_counts.py +++ /dev/null @@ -1,150 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
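
unique_with_counts returns unique values in order of first appearance, plus an inverse index and per-value counts; the deleted test builds this by re-sorting np.unique's sorted output by first occurrence. The same reference logic as a compact sketch (function name ours):

import numpy as np

def unique_with_counts_ref(x):
    # np.unique sorts the values, so re-order everything by the position of
    # each value's first appearance to get first-occurrence ordering.
    uniq, first_idx, inverse, counts = np.unique(
        x, return_index=True, return_inverse=True, return_counts=True
    )
    order = np.argsort(first_idx)
    rank = np.empty_like(order)
    rank[order] = np.arange(len(order))
    return uniq[order], rank[inverse], counts[order]

# Matches the expected values in the first test case below:
out, index, count = unique_with_counts_ref(np.array([2, 3, 3, 1, 5, 3]))
# out -> [2 3 1 5], index -> [0 1 1 2 3 1], count -> [1 3 1 1]
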
- -import unittest - -import numpy as np -from op_test import OpTest, paddle_static_guard - -import paddle -from paddle.base import core - - -class TestUniqueWithCountsOp(OpTest): - def setUp(self): - self.op_type = "unique_with_counts" - self.init_config() - - def test_check_output(self): - self.check_output() - - def init_config(self): - self.inputs = { - 'X': np.array([2, 3, 3, 1, 5, 3], dtype='int64'), - } - self.attrs = {'dtype': int(core.VarDesc.VarType.INT32)} - self.outputs = { - 'Out': np.array([2, 3, 1, 5], dtype='int64'), - 'Index': np.array([0, 1, 1, 2, 3, 1], dtype='int32'), - 'Count': np.array([1, 3, 1, 1], dtype='int32'), - } - - -class TestOne(TestUniqueWithCountsOp): - def init_config(self): - self.inputs = { - 'X': np.array([2], dtype='int64'), - } - self.attrs = {'dtype': int(core.VarDesc.VarType.INT32)} - self.outputs = { - 'Out': np.array([2], dtype='int64'), - 'Index': np.array([0], dtype='int32'), - 'Count': np.array([1], dtype='int32'), - } - - -class TestRandom(TestUniqueWithCountsOp): - def init_config(self): - input_data = np.random.randint(0, 100, (2000,), dtype='int64') - self.inputs = {'X': input_data} - self.attrs = {'dtype': int(core.VarDesc.VarType.INT64)} - np_unique, np_index, reverse_index = np.unique( - self.inputs['X'], True, True - ) - np_tuple = [(np_unique[i], np_index[i]) for i in range(len(np_unique))] - np_tuple.sort(key=lambda x: x[1]) - target_out = np.array([i[0] for i in np_tuple], dtype='int64') - target_index = np.array( - [list(target_out).index(i) for i in self.inputs['X']], dtype='int64' - ) - count = [0 for i in range(len(np_unique))] - for i in range(target_index.shape[0]): - count[target_index[i]] += 1 - target_count = np.array(count, dtype='int64') - self.outputs = { - 'Out': target_out, - 'Index': target_index, - 'Count': target_count, - } - - -class TestUniqueWithCountsRaiseError(unittest.TestCase): - def test_errors(self): - with paddle_static_guard(): - - def test_dtype(): - data = paddle.static.data( - shape=[10], dtype="int16", name="input" - ) - paddle.unique(data) - - self.assertRaises(TypeError, test_dtype) - - -@unittest.skipIf( - not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -) -class TestOneGPU(TestUniqueWithCountsOp): - def init_config(self): - self.inputs = { - 'X': np.array([2], dtype='int64'), - } - self.attrs = {'dtype': int(core.VarDesc.VarType.INT32)} - self.outputs = { - 'Out': np.array([2], dtype='int64'), - 'Index': np.array([0], dtype='int32'), - 'Count': np.array([1], dtype='int32'), - } - - def test_check_output(self): - if core.is_compiled_with_cuda(): - place = core.CUDAPlace(0) - self.check_output_with_place(place, atol=1e-5) - - -@unittest.skipIf( - not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -) -class TestRandomGPU(TestUniqueWithCountsOp): - def init_config(self): - input_data = np.random.randint(0, 100, (2000,), dtype='int64') - self.inputs = {'X': input_data} - self.attrs = {'dtype': int(core.VarDesc.VarType.INT64)} - np_unique, np_index, reverse_index = np.unique( - self.inputs['X'], True, True - ) - np_tuple = [(np_unique[i], np_index[i]) for i in range(len(np_unique))] - np_tuple.sort(key=lambda x: x[1]) - target_out = np.array([i[0] for i in np_tuple], dtype='int64') - target_index = np.array( - [list(target_out).index(i) for i in self.inputs['X']], dtype='int64' - ) - count = [0 for i in range(len(np_unique))] - for i in range(target_index.shape[0]): - count[target_index[i]] += 1 - target_count = np.array(count, dtype='int64') - self.outputs = { - 'Out': 
target_out, - 'Index': target_index, - 'Count': target_count, - } - - def test_check_output(self): - if core.is_compiled_with_cuda(): - place = core.CUDAPlace(0) - self.check_output_with_place(place, atol=1e-5) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/legacy_test/test_unzip_op.py b/test/legacy_test/test_unzip_op.py deleted file mode 100644 index fd564fe6f3578..0000000000000 --- a/test/legacy_test/test_unzip_op.py +++ /dev/null @@ -1,115 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np - -import paddle -from paddle import base -from paddle.base import core - - -class TestUnzipOp(unittest.TestCase): - def test_result(self): - """ - For unzip op - """ - paddle.enable_static() - if core.is_compiled_with_cuda(): - place = base.CUDAPlace(0) - x = paddle.static.data(name='X', shape=[6], dtype='float64') - lod = paddle.static.data(name='lod', shape=[6], dtype='int64') - len = 4 - output = paddle.incubate.operators.unzip(x, lod, len) - - input = [1.0, 2.0, 3.0, 1.0, 2.0, 4.0] - lod = [0, 3, 3, 3, 4, 6] - - feed = { - 'X': np.array(input).astype("float64"), - 'lod': np.array(lod).astype("int64"), - } - - exe = base.Executor(place=place) - exe.run(base.default_startup_program()) - res = exe.run(feed=feed, fetch_list=[output]) - out = [ - [1.0, 2.0, 3.0, 0.0], - [0.0, 0.0, 0.0, 0.0], - [0.0, 0.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0], - [2.0, 4.0, 0.0, 0.0], - ] - out_np = np.array(out, dtype="float64") - assert (res == out_np).all(), "output is not right" - - -class TestUnzipOp_Complex(unittest.TestCase): - def test_result(self): - """ - For unzip op - """ - self.dtype = self.get_dtype() - paddle.enable_static() - prog = paddle.static.Program() - startup_prog = paddle.static.Program() - with paddle.static.program_guard(prog, startup_prog): - if core.is_compiled_with_cuda(): - place = base.CUDAPlace(0) - x = paddle.static.data( - name='Complex64_X', shape=[6], dtype=self.dtype - ) - lod = paddle.static.data(name='lodx', shape=[6], dtype='int64') - len = 4 - output = paddle.incubate.operators.unzip(x, lod, len) - input = [ - 1.0 + 1.0j, - 2.0 + 2.0j, - 3.0 + 3.0j, - 1.0 + 1.0j, - 2.0 + 2.0j, - 4.0 + 4.0j, - ] - lod = [0, 3, 3, 3, 4, 6] - - feed = { - 'Complex64_X': np.array(input).astype(self.dtype), - 'lodx': np.array(lod).astype("int64"), - } - - exe = base.Executor(place=place) - exe.run(base.default_startup_program()) - res = exe.run(prog, feed=feed, fetch_list=[output]) - out = [ - [1.0 + 1.0j, 2.0 + 2.0j, 3.0 + 3.0j, 0.0j], - [0.0j, 0.0j, 0.0j, 0.0j], - [0.0j, 0.0j, 0.0j, 0.0j], - [1.0 + 1.0j, 0.0j, 0.0j, 0.0j], - [2.0 + 2.0j, 4.0 + 4.0j, 0.0j, 0.0j], - ] - out_np = np.array(out, dtype=self.dtype) - assert (res == out_np).all(), "output is not right" - - def get_dtype(self): - return np.complex64 - - -class TestUnzipOp_Complex128(TestUnzipOp_Complex): - def get_dtype(self): - return np.complex128 - - -if __name__ == '__main__': - unittest.main() diff --git 
a/test/xpu/get_test_cover_info.py b/test/xpu/get_test_cover_info.py index c6f3756a69456..628691711ccd8 100644 --- a/test/xpu/get_test_cover_info.py +++ b/test/xpu/get_test_cover_info.py @@ -87,8 +87,6 @@ "grad_add_float32", # no api for grad_add, skip "lamb_float16", "lars_momentum_float32", - "resnet_unit", - "resnet_unit_grad", "c_embedding_float32", # unittests of collective ops do not using xpu testing framework "c_sync_comm_stream_float32", "c_sync_calc_stream_float32", diff --git a/test/xpu/test_fused_resnet_basic_block_op_xpu.py b/test/xpu/test_fused_resnet_basic_block_op_xpu.py deleted file mode 100644 index 83aa25f54018f..0000000000000 --- a/test/xpu/test_fused_resnet_basic_block_op_xpu.py +++ /dev/null @@ -1,307 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import unittest - -import numpy as np -from get_test_cover_info import ( - XPUOpTestWrapper, - create_test_class, - get_xpu_op_support_types, -) -from op_test import OpTest - -import paddle -from paddle import base, nn -from paddle.base import core -from paddle.base.framework import default_main_program -from paddle.incubate.xpu.resnet_block import ResNetBasicBlock - - -class XPUTestResNetBasicBlockOp(XPUOpTestWrapper): - def __init__(self): - self.op_name = "resnet_basic_block" - self.use_dynamic_create_class = False - - class TestResNetBasicBlockOp(OpTest): - def setUp(self): - self.dtype = self.in_type - self.place = paddle.XPUPlace(0) - self.__class__.op_type = "resnet_basic_block" - self.__class__.no_need_check_grad = True - self.getShape() - self.getDiff() - self.getShortcut() - paddle.set_default_dtype(self.dtype) - - self.src = np.random.random(self.input_size).astype(self.dtype) - self.dout = np.random.random(self.output_size).astype(self.dtype) - - def getShape(self): - self.in_channels = 8 - self.out_channels = 8 - self.stride = 1 - self.input_size = [2, 8, 32, 32] # NCHW - self.output_size = [2, 8, 32, 32] # NCHW - - def getDiff(self): - self.rtol = 1e-3 - self.atol = 1e-3 - - def getShortcut(self): - self.has_shortcut = False - - def Base(self): - conv1_weight = base.ParamAttr( - initializer=paddle.nn.initializer.XavierNormal(), - learning_rate=0.001, - ) - conv2_weight = base.ParamAttr( - initializer=paddle.nn.initializer.XavierNormal(), - learning_rate=0.001, - ) - conv3_weight = base.ParamAttr( - initializer=paddle.nn.initializer.XavierNormal(), - learning_rate=0.001, - ) - bn1_weight = base.ParamAttr( - initializer=paddle.nn.initializer.Constant(value=1.0) - ) - bn1_bias = base.ParamAttr( - initializer=paddle.nn.initializer.Constant(value=0.0) - ) - bn2_weight = base.ParamAttr( - initializer=paddle.nn.initializer.Constant(value=1.0) - ) - bn2_bias = base.ParamAttr( - initializer=paddle.nn.initializer.Constant(value=0.0) - ) - bn3_weight = base.ParamAttr( - initializer=paddle.nn.initializer.Constant(value=1.0) - ) - bn3_bias = base.ParamAttr( - initializer=paddle.nn.initializer.Constant(value=0.0) - ) - - self.conv1 = nn.Conv2D( - 
-                in_channels=self.in_channels,
-                out_channels=self.out_channels,
-                kernel_size=3,
-                stride=self.stride,
-                padding=1,
-                weight_attr=conv1_weight,
-                bias_attr=None,
-                data_format='NCHW',
-            )
-            self.bn1 = paddle.nn.BatchNorm(
-                self.out_channels,
-                act='relu',
-                param_attr=bn1_weight,
-                bias_attr=bn1_bias,
-                data_layout='NCHW',
-            )
-            self.conv2 = nn.Conv2D(
-                in_channels=self.out_channels,
-                out_channels=self.out_channels,
-                kernel_size=3,
-                stride=1,
-                padding=1,
-                weight_attr=conv2_weight,
-                bias_attr=None,
-                data_format='NCHW',
-            )
-            self.bn2 = paddle.nn.BatchNorm(
-                self.out_channels,
-                act=None,
-                param_attr=bn2_weight,
-                bias_attr=bn2_bias,
-                data_layout='NCHW',
-            )
-            self.conv3 = nn.Conv2D(
-                in_channels=self.in_channels,
-                out_channels=self.out_channels,
-                kernel_size=1,
-                stride=self.stride,
-                padding=0,
-                weight_attr=conv3_weight,
-                bias_attr=None,
-                data_format='NCHW',
-            )
-            self.bn3 = paddle.nn.BatchNorm(
-                self.out_channels,
-                act=None,
-                param_attr=bn3_weight,
-                bias_attr=bn3_bias,
-                data_layout='NCHW',
-            )
-            self.relu = nn.ReLU()
-
-            tensor_src = paddle.to_tensor(self.src, stop_gradient=False)
-            if self.has_shortcut:
-                z_out = self.bn3(self.conv3(tensor_src))
-            else:
-                z_out = tensor_src
-            bn1_out = self.bn1(self.conv1(tensor_src))
-            bn2_out = self.bn2(self.conv2(bn1_out))
-            result = self.relu(bn2_out + z_out)
-            paddle.autograd.backward(
-                [result], [paddle.to_tensor(self.dout)], True
-            )
-            return result, tensor_src.grad
-
-        def FusedResNetBasicBlock(self):
-            fused_conv1_weight = base.ParamAttr(
-                initializer=paddle.nn.initializer.XavierNormal(),
-                learning_rate=0.001,
-            )
-            fused_conv2_weight = base.ParamAttr(
-                initializer=paddle.nn.initializer.XavierNormal(),
-                learning_rate=0.001,
-            )
-            fused_conv3_weight = base.ParamAttr(
-                initializer=paddle.nn.initializer.XavierNormal(),
-                learning_rate=0.001,
-            )
-            fused_bn1_weight = base.ParamAttr(
-                initializer=paddle.nn.initializer.Constant(value=1.0)
-            )
-            fused_bn1_bias = base.ParamAttr(
-                initializer=paddle.nn.initializer.Constant(value=0.0)
-            )
-            fused_bn2_weight = base.ParamAttr(
-                initializer=paddle.nn.initializer.Constant(value=1.0)
-            )
-            fused_bn2_bias = base.ParamAttr(
-                initializer=paddle.nn.initializer.Constant(value=0.0)
-            )
-            fused_bn3_weight = base.ParamAttr(
-                initializer=paddle.nn.initializer.Constant(value=1.0)
-            )
-            fused_bn3_bias = base.ParamAttr(
-                initializer=paddle.nn.initializer.Constant(value=0.0)
-            )
-
-            if self.has_shortcut:
-                self.resnet_basic_block = ResNetBasicBlock(
-                    num_channels1=self.in_channels,
-                    num_filter1=self.out_channels,
-                    filter1_size=3,
-                    num_channels2=self.out_channels,
-                    num_filter2=self.out_channels,
-                    filter2_size=3,
-                    num_channels3=self.in_channels,
-                    num_filter3=self.out_channels,
-                    filter3_size=1,
-                    filter1_attr=fused_conv1_weight,
-                    scale1_attr=fused_bn1_weight,
-                    bias1_attr=fused_bn1_bias,
-                    filter2_attr=fused_conv2_weight,
-                    scale2_attr=fused_bn2_weight,
-                    bias2_attr=fused_bn2_bias,
-                    filter3_attr=fused_conv3_weight,
-                    scale3_attr=fused_bn3_weight,
-                    bias3_attr=fused_bn3_bias,
-                    stride1=self.stride,
-                    stride2=1,
-                    stride3=self.stride,
-                    act='relu',
-                    padding1=1,
-                    padding2=1,
-                    padding3=0,
-                    has_shortcut=True,
-                )
-            else:
-                self.resnet_basic_block = ResNetBasicBlock(
-                    num_channels1=self.in_channels,
-                    num_filter1=self.out_channels,
-                    filter1_size=3,
-                    num_channels2=self.out_channels,
-                    num_filter2=self.out_channels,
-                    filter2_size=3,
-                    num_channels3=self.in_channels,
-                    num_filter3=self.out_channels,
-                    filter3_size=1,
-                    filter1_attr=fused_conv1_weight,
-                    scale1_attr=fused_bn1_weight,
-                    bias1_attr=fused_bn1_bias,
-                    filter2_attr=fused_conv2_weight,
-                    scale2_attr=fused_bn2_weight,
-                    bias2_attr=fused_bn2_bias,
-                    filter3_attr=fused_conv3_weight,
-                    scale3_attr=fused_bn3_weight,
-                    bias3_attr=fused_bn3_bias,
-                    stride1=self.stride,
-                    stride2=1,
-                    stride3=self.stride,
-                    act='relu',
-                    padding1=1,
-                    padding2=1,
-                    padding3=1,
-                    has_shortcut=False,
-                )
-
-            x = paddle.to_tensor(self.src, stop_gradient=False)
-            out = self.resnet_basic_block.forward(x)
-            paddle.autograd.backward([out], [paddle.to_tensor(self.dout)])
-            return out, x.grad
-
-        def test_out_and_grad_has_shortcut(self):
-            self.has_shortcut = True
-            default_main_program().random_seed = 1
-            base_out, base_grad = self.Base()
-            fused_out, fused_grad = self.FusedResNetBasicBlock()
-            np.testing.assert_allclose(
-                base_out.numpy(),
-                fused_out.numpy(),
-                rtol=self.rtol,
-                atol=self.atol,
-            )
-            np.testing.assert_allclose(
-                base_grad.numpy(),
-                fused_grad.numpy(),
-                rtol=self.rtol,
-                atol=self.atol,
-            )
-
-        def test_out_and_grad(self):
-            self.has_shortcut = False
-            default_main_program().random_seed = 1
-            base_out, base_grad = self.Base()
-            fused_out, fused_grad = self.FusedResNetBasicBlock()
-            np.testing.assert_allclose(
-                base_out.numpy(),
-                fused_out.numpy(),
-                rtol=self.rtol,
-                atol=self.atol,
-            )
-            np.testing.assert_allclose(
-                base_grad.numpy(),
-                fused_grad.numpy(),
-                rtol=self.rtol,
-                atol=self.atol,
-            )
-
-
-support_types = get_xpu_op_support_types('resnet_basic_block')
-for stype in support_types:
-    create_test_class(
-        globals(),
-        XPUTestResNetBasicBlockOp,
-        stype,
-        ignore_device_version=[core.XPUVersion.XPU1],
-    )
-
-if __name__ == '__main__':
-    unittest.main()
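For orientation, the deleted XPU test checked ResNetBasicBlock against the unfused path it builds in Base(): relu(bn2(conv2(bn1_relu(conv1(x)))) + shortcut(x)), where the shortcut is bn3(conv3(x)) when has_shortcut is true and the identity otherwise. A minimal dynamic-graph sketch of that reference path for the identity case, using only the layers and default shapes the test itself declares (a sketch of the reference math, not the fused kernel):

import numpy as np
import paddle
from paddle import nn

# Shapes from the test's getShape(): NCHW [2, 8, 32, 32], 8 -> 8 channels, stride 1.
conv1 = nn.Conv2D(8, 8, kernel_size=3, stride=1, padding=1, bias_attr=None)
bn1 = paddle.nn.BatchNorm(8, act='relu')  # conv1 -> bn -> relu
conv2 = nn.Conv2D(8, 8, kernel_size=3, stride=1, padding=1, bias_attr=None)
bn2 = paddle.nn.BatchNorm(8, act=None)    # no activation before the residual add
relu = nn.ReLU()

x = paddle.to_tensor(
    np.random.random([2, 8, 32, 32]).astype('float32'), stop_gradient=False
)
out = relu(bn2(conv2(bn1(conv1(x)))) + x)  # identity shortcut (has_shortcut=False)
out.sum().backward()  # gradients reach x through both branches, as the test asserted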